In [3]:
# fine_tune_lung_cancer.ipynb

# ============================
# 📘 Fine-Tuning FLAN-T5 (an LLM) on Lung Cancer Q/A
# ============================
# See readme.md for a brief explanation of how this code works.

In [None]:
# install packages 
!pip install torch==2.2.2 transformers==4.41.2 datasets==2.20.0 evaluate==0.4.2 numpy==1.26.4 --quiet

json → handle dataset format.
Dataset → organize data for training.
AutoTokenizer → convert text into tokens.
AutoModelForSeq2SeqLM → load a pretrained seq2seq model.
Seq2SeqTrainingArguments → set training parameters.
Seq2SeqTrainer → train/evaluate the model easily.
torch → core ML computations and GPU support.

In [None]:
# import packages 
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

Load the dataset from `lung_cancer.jsonl`.

In [None]:
# Load JSONL dataset
def load_dataset(file_path):
    """Read a JSON Lines file into a Hugging Face ``Dataset``.

    Every non-blank line of the file is parsed as one JSON record; lines that
    are empty or contain only whitespace are skipped.

    Args:
        file_path: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A ``Dataset`` built from the parsed records, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        records = [json.loads(raw) for raw in handle if raw.strip()]
    return Dataset.from_list(records)

# Change the path below if your JSONL file lives somewhere else.
dataset = load_dataset(file_path="lung_cancer.jsonl")
dataset


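Load Google's FLAN-T5 small model, which is small, fast, and accurate for its size.
`AutoTokenizer` is used because it picks the tokenizer that matches the chosen model.
`AutoModelForSeq2SeqLM` loads the model for sequence-to-sequence language modeling, which is the setup used here for question answering.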

In [None]:
# Checkpoint to fine-tune (small and efficient); the seq2seq model and its
# tokenizer are both loaded from this same name.
model_name = "google/flan-t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In preprocessing we tokenize the inputs and the expected outputs so the model can be fine-tuned on them. `max_length` defines the maximum number of tokens the tokenizer keeps for each text; anything longer is truncated.

In [None]:
# preprocessing 
def preprocess(batch):
    """Tokenize a batch of input/output text pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with an ``"input"`` list (source texts) and an
            ``"output"`` list (target texts), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the source texts with an added ``"labels"``
        entry holding the target token ids. Both sides are truncated/padded
        to 128 tokens; padding positions in ``"labels"`` are set to -100.
    """
    inputs = tokenizer(batch["input"], max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(batch["output"], max_length=128, truncation=True, padding="max_length")
    # Targets are padded to a fixed length, so most label positions are padding.
    # Replace pad ids with -100 (the index the Hugging Face seq2seq loss ignores)
    # so the model is not trained to predict padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return inputs

# Run the preprocessing function over the whole dataset, one batch at a time.
tokenized_dataset = dataset.map(function=preprocess, batched=True)
tokenized_dataset


Split the data into training and test sets so we can check whether the model works correctly and accurately.

In [None]:
# Split into train (90%) and test (10%) sets. The fixed seed makes the random
# split identical on every run, so results are reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)


Assign the arguments used for training the model.

In [None]:
# Training arguments
args = Seq2SeqTrainingArguments(
    output_dir="./results",          # checkpoints are written here
    eval_strategy="steps",           # `evaluation_strategy` is deprecated as of transformers 4.41
    eval_steps=500,                  # evaluate every 500 optimizer steps
    save_strategy="steps",
    save_steps=500,                  # checkpoint on the same schedule as evaluation
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,              # keep only the two most recent checkpoints
    predict_with_generate=True,
    push_to_hub=False
)


Use the Hugging Face `Seq2SeqTrainer`, which trains the model according to the given model and arguments. We pass it the training split for training and the test split for evaluating the fine-tuned model.

In [None]:
# Wire the model, its tokenizer, the training arguments and both data splits
# into a single trainer object.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=split_dataset["test"],
    train_dataset=split_dataset["train"],
)


Train the model, then save it as the `fine_tuned_lung_cancer` model.

In [None]:
# Fine-tune, then write the model weights and the tokenizer files into the
# same output folder.
trainer.train()
save_dir = "./fine_tuned_lung_cancer"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("✅ Training complete! Model saved in ./fine_tuned_lung_cancer")