In [1]:
# NOTE(review): hardcoded absolute path (looks like a Colab + Google Drive mount — confirm the drive is mounted first).
# Later cells build paths from Path.cwd() and the relative 'data' folder, so they only work from this directory.
%cd /content/drive/MyDrive/CropLeafDiseaseDetectionApp/crop-leaf-disease-detection

/content/drive/MyDrive/CropLeafDiseaseDetectionApp/crop-leaf-disease-detection


In [2]:
from fastai.vision.all import *

# Explicit stdlib imports come after the wildcard import so they cannot be shadowed by it.
import random
import shutil
from pathlib import Path



In [None]:
# Dataset root, laid out as data/PlantVillage/<class>/*.ext
data_path = Path.cwd()/'data'/'PlantVillage'

# List every image under the root. The directory walk order depends on the
# filesystem, so sort the paths: a seeded shuffle applied to `files` later
# then selects the same images on every machine and every run.
files = get_image_files(data_path).sorted()

## OPTION A - Folder-based plan (clean and safe)
* Keep your class folders as-is for train/valid, and move a held-out test set into a separate sibling folder (`PlantVillage_test/`).

* `ImageDataLoaders.from_folder` sees only `PlantVillage/` → random train/valid split.
* Your test set lives outside that root, so it’s never seen during training/tuning.
* You still get a one-liner to evaluate on the test set via `dls.test_dl(...)`.

In [None]:
# ================================================
# 1) Make a stratified test set (≈10% per class)
# ================================================
# This cell MOVES files on disk, so it is written to be safe to re-run:
# a class whose test folder already contains images is left untouched
# instead of losing another ~10% of its remaining images.

root = Path('data')/'PlantVillage'
test_root = Path('data')/'PlantVillage_test'
test_pct = 0.10
seed = 42

test_root.mkdir(parents=True, exist_ok=True)

# Sorted class order + sorted file order make the seeded selection reproducible
# regardless of the order in which the filesystem lists entries.
for cls_dir in sorted(d for d in root.iterdir() if d.is_dir()):
    # mirror the class folder under PlantVillage_test
    test_cls_dir = test_root/cls_dir.name
    test_cls_dir.mkdir(parents=True, exist_ok=True)

    if len(get_image_files(test_cls_dir)) > 0:
        continue  # this class was already split by a previous run

    imgs = sorted(get_image_files(cls_dir))
    if not imgs:
        continue

    # Private per-class RNG: the global `random` state is not touched, and the
    # files chosen for one class do not depend on which other classes were processed.
    rng = random.Random(f"{seed}:{cls_dir.name}")
    rng.shuffle(imgs)

    # ~test_pct of the class, at least 1, but always leave at least one image for training
    n_test = min(len(imgs) - 1, max(1, int(len(imgs)*test_pct)))
    for p in imgs[:n_test]:
        dst = test_cls_dir/p.name
        # use copy if you prefer to keep originals; move avoids duplicates
        shutil.move(str(p), str(dst))

print("Done. Test images moved to:", test_root)


## OPTION B - Use DataBlock
---

## 1. What is a `DataBlock` in fastai?

Think of `DataBlock` as a **blueprint** for how your data is transformed into a format that PyTorch models can use.
It answers 4 questions:

1. **What types of inputs/labels?**
   → (`ImageBlock`, `CategoryBlock`, `TextBlock`, etc.)
2. **Where to get the items from?**
   → `get_items=get_image_files`
3. **How to get labels?**
   → `get_y=parent_label` (folder name = label)
4. **How to split into train/valid?**
   → `splitter=RandomSplitter(...)`, `GrandparentSplitter(...)`, or custom logic.

On top of that, you can add:

* **Transforms before batching** → `item_tfms=Resize(224)`
* **Transforms after batching** → `batch_tfms=aug_transforms()`

So `DataBlock` is about **declarative setup** — you say *what you want*, and fastai handles the rest.

---

## 2. Why not just `ImageDataLoaders.from_folder`?

* `ImageDataLoaders.from_folder` is a **shortcut** built on top of `DataBlock`.
* It’s great for quick experiments (train/valid only).
* But it’s limited — if you want **3 splits (train/valid/test)** or unusual labeling logic, it won’t work directly.

That’s why Option B uses `DataBlock` instead of the `from_folder` shortcut from Option A. It gives **flexibility**.

---

## 3. Use of `DataBlock` in your PlantVillage case

* We need a **custom splitter** (train, valid, test).
* We need **flexible transforms** (later you’ll add augmentations like rotation, lighting, flips, etc.).
* You want a project that looks **professional** on your portfolio → showing a `DataBlock` setup highlights your control over data processing.

So here, `DataBlock` is useful because:

* It lets you plug in your **own split logic** (the 3-way split).
* It makes it easier to extend later (e.g., add data augmentation).
* It avoids manually reorganizing files into `train/valid/test` folders.

---

## 4. Analogy

* `ImageDataLoaders.from_folder` = **auto mode** in a DSLR camera. Quick, convenient, but limited.
* `DataBlock` = **manual mode**. You control ISO, shutter, aperture → flexible, professional, but needs more setup.

---

✅ In summary:
We use a `DataBlock` here **only because** you want a **3-way split** (train/valid/test) while keeping files in place. Otherwise, `from_folder` would be enough.

---

In [None]:
# ---- 1) Create a 3-way split function ----
def split_train_valid_test(files, valid_pct=0.2, test_pct=0.1, seed=42):
    """Shuffle `files` reproducibly and cut them into train/valid/test lists.

    Args:
        files: Iterable of items (e.g. image paths). It is copied, never mutated.
        valid_pct: Fraction of the items assigned to the validation list.
        test_pct: Fraction of the items assigned to the test list.
        seed: Seed of the private RNG used for the shuffle.

    Returns:
        A tuple `(train, valid, test)` of lists. The valid and test sizes are
        floored (`int(pct * n)`); train receives all remaining items.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if valid_pct < 0 or test_pct < 0 or valid_pct + test_pct > 1:
        raise ValueError(
            f"valid_pct and test_pct must be >= 0 and sum to at most 1, "
            f"got valid_pct={valid_pct}, test_pct={test_pct}"
        )

    shuffled = list(files)
    # A private Random(seed) yields the same permutation as seeding the global
    # generator, but without resetting the RNG state other code may rely on.
    random.Random(seed).shuffle(shuffled)

    n = len(shuffled)
    n_test = int(test_pct * n)
    n_valid = int(valid_pct * n)

    test = shuffled[:n_test]
    valid = shuffled[n_test:n_test + n_valid]
    train = shuffled[n_test + n_valid:]
    return train, valid, test

# NOTE: `files` must reflect what is currently on disk; re-run the listing cell
# first if images have been moved since it was created.
splits = split_train_valid_test(files, valid_pct=0.2, test_pct=0.1)
train_files, valid_files, test_files = splits


# ---- 2) Build DataBlock ----
# fastai's built-in splitters are 2-way (train/valid) only, so the test files
# are kept out of the DataBlock entirely:
#   * get_items serves only the train + valid paths,
#   * FuncSplitter sends a path to validation when it belongs to `valid_files`.
# (IndexSplitter expects integer positions, not Path objects, and putting the
# test files into the validation set would leak them into model selection.)
valid_lookup = set(valid_files)  # set -> O(1) membership test per item

def get_train_valid_items(source):
    """Return the train + valid paths; `source` is ignored because the split is precomputed."""
    return train_files + valid_files

def is_valid_item(path):
    """True when `path` was assigned to the validation split."""
    return path in valid_lookup

dblock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_train_valid_items,
    get_y=parent_label,
    splitter=FuncSplitter(is_valid_item),
    item_tfms=Resize(224)
)

# ---- 3) Create DataLoaders ----
dls = dblock.dataloaders(data_path, bs=64)

# The loaders must contain exactly the planned train/valid items (and no test items).
assert len(dls.train_ds) == len(train_files)
assert len(dls.valid_ds) == len(valid_files)

# ---- 4) Separate test set into its own DataLoader ----
# with_labels=True keeps the targets so the test set can be scored, not just predicted on.
test_dl = dls.test_dl(test_files, with_labels=True)

# ---- 5) Check classes and sizes ----
dls.show_batch(max_n=9, figsize=(8,8))
print(f"Classes: {dls.vocab}")
print(f"Train size: {len(splits[0])}")
print(f"Valid size: {len(splits[1])}")
print(f"Test size:  {len(splits[2])}")
