<a href="https://colab.research.google.com/github/manasa190/Advertising-Sales-Prediction/blob/main/stem_disease_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# download_and_build_stemify_improved.py
import kagglehub
from pathlib import Path
import shutil, random, os, zipfile, sys
from PIL import Image
from tqdm import tqdm

# ----------------- CONFIG -----------------
DATASET_SLUGS = {
    "rot":   "shrupyag001/philippines-rice-diseases",          # stem rot
    "rust":  "kushagra3204/wheat-plant-diseases",             # stem rust
    "canker":"nguynphancminh/dragonfruitdataset-qma",         # stem canker
    "wilt":  "testtech4biz/biotic-and-abiotic-factors-of-tomato-disease",  # wilt + biotic tomato
    "cotton":"dhamur/cotton-plant-disease",                   # cotton (blight/healthy)
    # optional extras
    #"mango": "chiragchouhan/mangofruitdds"
}

# canonical classes -> name patterns to match inside datasets (case-insensitive)
CLASS_PATTERNS = {
    "Stem_Rust": ["stem rust", "black rust", "stem_rust", "rust"],
    "Stem_Rot":  ["stem rot", "stem_end_rot", "soft rot", "sheath rot", "rot"],
    "Stem_Canker":["stem canker", "canker"],
    "Wilt":      ["wilt", "fusarium wilt", "bacterial wilt"],
    "Blight":    ["blight", "anthracnose", "early blight", "late blight", "leaf blight", "alternaria"],
    "Healthy":   ["healthy", "fresh"]
}

OUT_BASE = Path("data") / "STEMIFY_Dataset"
TRAIN_DIR = OUT_BASE / "Train"
TEST_DIR  = OUT_BASE / "Test"
IMG_SIZE = (224, 224)
TRAIN_SPLIT = 0.8
DRY_RUN = False   # set True if you want to see matches without copying
# ------------------------------------------

random.seed(42)

def safe_mkdir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

safe_mkdir(TRAIN_DIR)
safe_mkdir(TEST_DIR)

def extract_if_zip(p: Path) -> Path:
    """
    If p is a .zip file, extract to sibling folder and return extracted folder path.
    If p is a directory, return p.
    """
    if p.is_file() and p.suffix.lower() == '.zip':
        out_dir = p.with_suffix('')  # remove .zip
        if out_dir.exists():
            return out_dir
        print(f"Extracting {p} -> {out_dir}")
        with zipfile.ZipFile(p, 'r') as z:
            z.extractall(out_dir)
        return out_dir
    return p

def find_folder_matches(root: Path, patterns):
    """
    Recursively find directories under `root` whose folder-name contains ANY pattern token.
    Returns list of Path objects (directories).
    """
    found = []
    if not root.exists():
        return found
    for d in root.rglob('*'):
        if not d.is_dir():
            continue
        name = d.name.lower()
        for token in patterns:
            if token.lower() in name:
                found.append(d)
                break
    return found

downloaded = {}
print("Downloading datasets (kagglehub)...")
for key, slug in DATASET_SLUGS.items():
    try:
        # kagglehub returns a path string; may be a directory or a zip file path
        raw = kagglehub.dataset_download(slug)
        ds_path = Path(raw)
        if not ds_path.exists():
            print(f"Warning: downloaded path not found on disk: {ds_path}")
        ds_path = extract_if_zip(ds_path)
        downloaded[key] = ds_path
        print(f"  • {slug} → {ds_path}")
    except Exception as e:
        print(f"Download failed for {slug}: {e}")

# Temporary collector
TEMP = Path(".temp_stemify_all")
if TEMP.exists():
    shutil.rmtree(TEMP)
TEMP.mkdir(parents=True, exist_ok=True)

print("\nScanning downloaded datasets for matching class folders...")
for key, ds_path in downloaded.items():
    if not ds_path.exists():
        print(f"  ! skipping {key} because path not found: {ds_path}")
        continue
    for class_name, patterns in CLASS_PATTERNS.items():
        matches = find_folder_matches(ds_path, patterns)
        for m in matches:
            dest = TEMP / class_name / f"{key}_{m.name.replace(' ','_')}"
            safe_mkdir(dest)
            copied = 0
            for f in m.glob("*"):
                if f.suffix.lower() in (".jpg", ".jpeg", ".png", ".bmp", ".webp"):
                    if DRY_RUN:
                        copied += 1
                        continue
                    try:
                        shutil.copy2(f, dest / f.name)
                        copied += 1
                    except Exception:
                        continue
            if copied:
                print(f"  • {copied} img(s) → {class_name} from {key}/{m.name}")
            else:
                # no files found in that matched folder
                pass

# Merge, shuffle, split, resize
print("\nMerging classes, splitting train/test, resizing to", IMG_SIZE, "...")
for class_name in CLASS_PATTERNS.keys():
    class_temp_path = TEMP / class_name
    if not class_temp_path.exists():
        print(f"  ! No temp folder for class {class_name}")
        continue

    # collect all image files under all matched subfolders
    all_imgs = [p for p in class_temp_path.rglob("*") if p.suffix.lower() in (".jpg", ".jpeg", ".png", ".bmp", ".webp")]
    if not all_imgs:
        print(f"  ! No images found for class {class_name}")
        continue

    random.shuffle(all_imgs)
    cut = int(TRAIN_SPLIT * len(all_imgs))
    train_imgs = all_imgs[:cut]
    test_imgs  = all_imgs[cut:]

    # helper to resize & save
    def save_list(img_list, dest_base):
        dest_folder = dest_base / class_name
        safe_mkdir(dest_folder)
        for i, p in enumerate(tqdm(img_list, desc=f"Saving {class_name} -> {dest_folder.name}", unit='img')):
            try:
                img = Image.open(p).convert("RGB")
                img = img.resize(IMG_SIZE, Image.LANCZOS)
                fname = f"{class_name.lower()}_{i:06d}{p.suffix.lower()}"
                out_path = dest_folder / fname
                img.save(out_path, quality=95)
            except Exception:
                continue

    save_list(train_imgs, TRAIN_DIR)
    save_list(test_imgs, TEST_DIR)

# cleanup
try:
    shutil.rmtree(TEMP)
except Exception:
    pass

print("\nDone. Dataset folders at:")
print(" Train:", TRAIN_DIR.resolve())
print(" Test :", TEST_DIR.resolve())

# quick counts
for cls in sorted(TRAIN_DIR.iterdir()):
    if cls.is_dir():
        print(f"  {cls.name} train: {len(list(cls.glob('*')))} images")
for cls in sorted(TEST_DIR.iterdir()):
    if cls.is_dir():
        print(f"  {cls.name} test: {len(list(cls.glob('*')))} images")


Downloading datasets (kagglehub)...
Downloading from https://www.kaggle.com/api/v1/datasets/download/shrupyag001/philippines-rice-diseases?dataset_version_number=6...


100%|██████████| 46.8M/46.8M [00:00<00:00, 191MB/s]

Extracting files...





  • shrupyag001/philippines-rice-diseases → /root/.cache/kagglehub/datasets/shrupyag001/philippines-rice-diseases/versions/6
Downloading from https://www.kaggle.com/api/v1/datasets/download/kushagra3204/wheat-plant-diseases?dataset_version_number=6...


100%|██████████| 6.09G/6.09G [00:41<00:00, 158MB/s]

Extracting files...





  • kushagra3204/wheat-plant-diseases → /root/.cache/kagglehub/datasets/kushagra3204/wheat-plant-diseases/versions/6
Downloading from https://www.kaggle.com/api/v1/datasets/download/nguynphancminh/dragonfruitdataset-qma?dataset_version_number=1...


100%|██████████| 929M/929M [00:11<00:00, 87.7MB/s]

Extracting files...





  • nguynphancminh/dragonfruitdataset-qma → /root/.cache/kagglehub/datasets/nguynphancminh/dragonfruitdataset-qma/versions/1
Downloading from https://www.kaggle.com/api/v1/datasets/download/testtech4biz/biotic-and-abiotic-factors-of-tomato-disease?dataset_version_number=1...


100%|██████████| 40.1k/40.1k [00:00<00:00, 45.2MB/s]

Extracting files...
  • testtech4biz/biotic-and-abiotic-factors-of-tomato-disease → /root/.cache/kagglehub/datasets/testtech4biz/biotic-and-abiotic-factors-of-tomato-disease/versions/1





Downloading from https://www.kaggle.com/api/v1/datasets/download/dhamur/cotton-plant-disease?dataset_version_number=14...


100%|██████████| 3.98G/3.98G [00:37<00:00, 115MB/s]

Extracting files...





  • dhamur/cotton-plant-disease → /root/.cache/kagglehub/datasets/dhamur/cotton-plant-disease/versions/14

Scanning downloaded datasets for matching class folders...
  • 100 img(s) → Stem_Rot from rot/stem_rot
  • 91 img(s) → Stem_Rot from rot/sheath_rot
  • 100 img(s) → Stem_Rot from rot/stem_rot
  • 91 img(s) → Stem_Rot from rot/sheath_rot
  • 98 img(s) → Blight from rot/sheath_blight
  • 97 img(s) → Blight from rot/bacterial_leaf_blight
  • 98 img(s) → Blight from rot/sheath_blight
  • 140 img(s) → Blight from rot/bacterial_leaf_blight
  • 100 img(s) → Healthy from rot/healthy_rice_plant
  • 140 img(s) → Healthy from rot/healthy_rice_plant
  • 576 img(s) → Stem_Rust from rust/Black Rust
  • 1271 img(s) → Stem_Rust from rust/Brown Rust
  • 1301 img(s) → Stem_Rust from rust/Yellow Rust
  • 50 img(s) → Stem_Rust from rust/brown_rust_test
  • 50 img(s) → Stem_Rust from rust/yellow_rust_test
  • 50 img(s) → Stem_Rust from rust/black_rust_test
  • 20 img(s) → Stem_Rust from rust/black_rus

Saving Stem_Rust -> Stem_Rust: 100%|██████████| 2686/2686 [03:58<00:00, 11.24img/s]
Saving Stem_Rust -> Stem_Rust: 100%|██████████| 672/672 [00:57<00:00, 11.59img/s]
Saving Stem_Rot -> Stem_Rot: 100%|██████████| 1528/1528 [01:08<00:00, 22.20img/s]
Saving Stem_Rot -> Stem_Rot: 100%|██████████| 383/383 [00:16<00:00, 23.06img/s]
Saving Stem_Canker -> Stem_Canker: 100%|██████████| 82/82 [00:00<00:00, 128.46img/s]
Saving Stem_Canker -> Stem_Canker: 100%|██████████| 21/21 [00:00<00:00, 137.98img/s]


  ! No temp folder for class Wilt


Saving Blight -> Blight: 100%|██████████| 2964/2964 [01:02<00:00, 47.23img/s]
Saving Blight -> Blight: 100%|██████████| 742/742 [00:14<00:00, 50.26img/s]
Saving Healthy -> Healthy: 100%|██████████| 2352/2352 [02:06<00:00, 18.53img/s]
Saving Healthy -> Healthy: 100%|██████████| 588/588 [00:34<00:00, 17.08img/s]



Done. Dataset folders at:
 Train: /content/data/STEMIFY_Dataset/Train
 Test : /content/data/STEMIFY_Dataset/Test
  Blight train: 2964 images
  Healthy train: 2352 images
  Stem_Canker train: 82 images
  Stem_Rot train: 1528 images
  Stem_Rust train: 2686 images
  Blight test: 742 images
  Healthy test: 588 images
  Stem_Canker test: 21 images
  Stem_Rot test: 383 images
  Stem_Rust test: 672 images
