In [None]:
# This notebook evaluates the fine-tuned model against the base model:
# 1. Load the base and fine-tuned models.
# 2. Generate predictions for a sample of the dataset.
# 3. Compare both models using ROUGE scores.

In [None]:
!pip install torch==2.2.2 transformers==4.41.2 datasets==2.20.0 evaluate==0.4.2 --quiet

The json module is used to handle dataset files.
datasets.Dataset organizes the data into a Hugging Face-compatible format.
AutoTokenizer and AutoModelForSeq2SeqLM load a pretrained sequence-to-sequence model along with its tokenizer.
The evaluate library provides standardized metrics (like ROUGE and BLEU) to measure the model’s performance.
Finally, torch enables tensor computations and GPU acceleration, which are essential for training and inference.

In [None]:
import json
import evaluate
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

Load the dataset from `lung_cancer.jsonl` and split it into train and test sets for evaluating the base model and the fine-tuned model.

In [None]:
# Load dataset from JSONL
def load_dataset(file_path):
    """Read a JSON Lines file into a ``Dataset``.

    Every line that is not blank (after stripping whitespace) is parsed as
    one JSON record; blank lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        The result of ``Dataset.from_list`` on the parsed records, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        records = [json.loads(raw) for raw in handle if raw.strip()]
    return Dataset.from_list(records)

dataset = load_dataset("lung_cancer.jsonl")   # <-- replace with your dataset

# Fix the shuffle seed so the split (and therefore the reported scores) is
# the same on every Restart & Run All.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# ✅ Use training data for evaluation
# NOTE(review): if the fine-tuned model was trained on these examples, scores on
# this split will favour it; split_dataset["test"] holds the other 10% —
# confirm which split is intended.
train_dataset = split_dataset["train"]
train_dataset

Load the base and fine-tuned models together with their tokenizers.

In [None]:
# Where each checkpoint comes from: the base model is referenced by its hub
# name, the fine-tuned model by a local directory.
base_model_name = "google/flan-t5-small"
fine_tuned_path = "./fine_tuned_lung_cancer"

# Base (pre-trained) tokenizer and model
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)

# Fine-tuned tokenizer and model
ft_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
ft_model = AutoModelForSeq2SeqLM.from_pretrained(fine_tuned_path)

Generate answers with both models.

In [None]:
# Generate an answer for evaluation
def generate_answer(model, tokenizer, question, max_length=128):
    """Generate one decoded answer for ``question``.

    Args:
        model: Model exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer used both to encode ``question`` and to decode
            the generated ids.
        question: Input text passed to the tokenizer.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.
    """
    # Put the encoded inputs on the model's device so generation also works
    # when the model has been moved to a GPU.
    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

We evaluate the models with ROUGE metrics, which measure the overlap between each predicted answer and its reference answer:
ROUGE-1 measures overlap of single words (unigrams).
ROUGE-2 measures overlap of two-word sequences (bigrams).
ROUGE-L measures the longest common subsequence of words shared by the prediction and the reference.

In [None]:
def evaluate_models(train_data, n_samples=50):
    """Compare the base and fine-tuned models with ROUGE on a data subset.

    For each of the first ``n_samples`` examples of ``train_data`` an answer
    is generated by both models, then each model's predictions are scored
    against the reference answers and the scores are printed. Uses the
    notebook-level ``base_model``, ``base_tokenizer``, ``ft_model``,
    ``ft_tokenizer`` and ``generate_answer``.

    Args:
        train_data: Dataset supporting ``len()`` and ``.select(indices)``
            whose examples have ``"input"`` and ``"output"`` fields.
        n_samples: Maximum number of leading examples to evaluate.

    Returns:
        dict: ``{"base": scores, "fine_tuned": scores}`` where each value is
        the mapping returned by the ROUGE metric's ``compute``.
    """
    rouge = evaluate.load("rouge")

    # Use a subset to keep it fast
    subset = train_data.select(range(min(n_samples, len(train_data))))

    base_preds, ft_preds, refs = [], [], []
    for example in subset:
        question = example["input"]
        reference = example["output"]

        base_preds.append(generate_answer(base_model, base_tokenizer, question))
        ft_preds.append(generate_answer(ft_model, ft_tokenizer, question))
        refs.append(reference)

    print("\n📊 Evaluation Results (ROUGE on TRAIN DATA):")

    scores = {
        "base": rouge.compute(predictions=base_preds, references=refs),
        "fine_tuned": rouge.compute(predictions=ft_preds, references=refs),
    }

    # One reporting loop for both models instead of two copies of the same code.
    for title, key in (("Base Model", "base"), ("Fine-tuned Model", "fine_tuned")):
        print(f"\n--- {title} ---")
        for metric, value in scores[key].items():
            print(f"{metric}: {value:.4f}")

    # Return the scores so later cells can tabulate, plot or assert on them.
    return scores
