In [None]:
# pip install icrawler pillow pandas tqdm
import os, time, csv, re
from pathlib import Path
from icrawler.builtin import GoogleImageCrawler
from PIL import Image
from tqdm import tqdm

# Species to crawl. Each entry is a 6-tuple of strings, in this order:
#   (binomial, common name, class, order, family, genus)
# The download loop further down unpacks entries in exactly this order; the
# binomial is also used (via safe_stem) as the per-species folder name.
TAXA = [
    ("Salmo salar", "Atlantic salmon", "Actinopterygii", "Salmoniformes", "Salmonidae", "Salmo"),
    ("Oncorhynchus mykiss", "Rainbow trout", "Actinopterygii", "Salmoniformes", "Salmonidae", "Oncorhynchus"),
    ("Gadus morhua", "Atlantic cod", "Actinopterygii", "Gadiformes", "Gadidae", "Gadus"),
    ("Melanogrammus aeglefinus", "Haddock", "Actinopterygii", "Gadiformes", "Gadidae", "Melanogrammus"),
    ("Scomber scombrus", "Atlantic mackerel", "Actinopterygii", "Scombriformes", "Scombridae", "Scomber"),
    ("Thunnus thynnus", "Atlantic bluefin tuna", "Actinopterygii", "Scombriformes", "Scombridae", "Thunnus"),
    ("Coryphaena hippurus", "Mahi-mahi", "Actinopterygii", "Carangiformes", "Coryphaenidae", "Coryphaena"),
    ("Xiphias gladius", "Swordfish", "Actinopterygii", "Xiphiiformes", "Xiphiidae", "Xiphias"),
    ("Clupea harengus", "Atlantic herring", "Actinopterygii", "Clupeiformes", "Clupeidae", "Clupea"),
    ("Sardina pilchardus", "European pilchard", "Actinopterygii", "Clupeiformes", "Clupeidae", "Sardina"),
    ("Engraulis encrasicolus", "European anchovy", "Actinopterygii", "Clupeiformes", "Engraulidae", "Engraulis"),
    ("Amphiprion ocellaris", "Ocellaris clownfish", "Actinopterygii", "Blenniiformes", "Pomacentridae", "Amphiprion"),
    ("Pomacanthus imperator", "Emperor angelfish", "Actinopterygii", "Acanthuriformes", "Pomacanthidae", "Pomacanthus"),
    ("Pterois volitans", "Red lionfish", "Actinopterygii", "Scorpaeniformes", "Scorpaenidae", "Pterois"),
    ("Zebrasoma flavescens", "Yellow tang", "Actinopterygii", "Acanthuriformes", "Acanthuridae", "Zebrasoma"),
    ("Hippocampus kuda", "Common seahorse", "Actinopterygii", "Syngnathiformes", "Syngnathidae", "Hippocampus"),
    ("Betta splendens", "Siamese fighting fish", "Actinopterygii", "Anabantiformes", "Osphronemidae", "Betta"),
    ("Paracheirodon innesi", "Neon tetra", "Actinopterygii", "Characiformes", "Characidae", "Paracheirodon"),
    ("Carassius auratus", "Goldfish", "Actinopterygii", "Cypriniformes", "Cyprinidae", "Carassius"),
    ("Cyprinus carpio", "Common carp", "Actinopterygii", "Cypriniformes", "Cyprinidae", "Cyprinus"),
    ("Poecilia reticulata", "Guppy", "Actinopterygii", "Cyprinodontiformes", "Poeciliidae", "Poecilia"),
    ("Astatotilapia burtoni", "Burton’s mouthbrooder", "Actinopterygii", "Cichliformes", "Cichlidae", "Astatotilapia"),
    ("Oreochromis niloticus", "Nile tilapia", "Actinopterygii", "Cichliformes", "Cichlidae", "Oreochromis"),
    ("Pterophyllum scalare", "Freshwater angelfish", "Actinopterygii", "Cichliformes", "Cichlidae", "Pterophyllum"),
    ("Micropterus salmoides", "Florida bass", "Actinopterygii", "Centrarchiformes", "Centrarchidae", "Micropterus"),
    ("Lepomis macrochirus", "Bluegill sunfish", "Actinopterygii", "Centrarchiformes", "Centrarchidae", "Lepomis"),
    ("Esox lucius", "Northern pike", "Actinopterygii", "Esociformes", "Esocidae", "Esox"),
    ("Ictalurus punctatus", "Channel catfish", "Actinopterygii", "Siluriformes", "Ictaluridae", "Ictalurus"),
    ("Silurus glanis", "Wels catfish", "Actinopterygii", "Siluriformes", "Siluridae", "Silurus"),
    ("Electrophorus electricus", "Electric eel", "Actinopterygii", "Gymnotiformes", "Gymnotidae", "Electrophorus"),
    ("Arapaima gigas", "Arapaima", "Actinopterygii", "Osteoglossiformes", "Arapaimidae", "Arapaima"),
    ("Osteoglossum bicirrhosum", "Silver arowana", "Actinopterygii", "Osteoglossiformes", "Osteoglossidae", "Osteoglossum"),
    ("Anguilla anguilla", "European eel", "Actinopterygii", "Anguilliformes", "Anguillidae", "Anguilla"),
    ("Muraena helena", "Mediterranean moray", "Actinopterygii", "Anguilliformes", "Muraenidae", "Muraena"),
    ("Lophius piscatorius", "Monkfish", "Actinopterygii", "Lophiiformes", "Lophiidae", "Lophius"),
    ("Hippoglossus hippoglossus", "Atlantic halibut", "Actinopterygii", "Pleuronectiformes", "Pleuronectidae", "Hippoglossus"),
    ("Pleuronectes platessa", "European plaice", "Actinopterygii", "Pleuronectiformes", "Pleuronectidae", "Pleuronectes"),
    ("Sphyraena barracuda", "Great barracuda", "Actinopterygii", "Carangiformes", "Sphyraenidae", "Sphyraena"),
    ("Dicentrarchus labrax", "European seabass", "Actinopterygii", "Acanthuriformes", "Moronidae", "Dicentrarchus"),
    ("Lutjanus campechanus", "Northern red snapper", "Actinopterygii", "Acanthuriformes", "Lutjanidae", "Lutjanus"),
    ("Epinephelus itajara", "Goliath grouper", "Actinopterygii", "Perciformes", "Epinephelidae", "Epinephelus"),
    ("Cheilinus undulatus", "Humphead wrasse", "Actinopterygii", "Labriformes", "Labridae", "Cheilinus"),
    ("Gobius niger", "Black goby", "Actinopterygii", "Gobiiformes", "Gobiidae", "Gobius"),
    ("Carcharodon carcharias", "Great white shark", "Chondrichthyes", "Lamniformes", "Lamnidae", "Carcharodon"),
    ("Galeocerdo cuvier", "Tiger shark", "Chondrichthyes", "Carcharhiniformes", "Carcharhinidae", "Galeocerdo"),
    ("Sphyrna lewini", "Scalloped hammerhead", "Chondrichthyes", "Carcharhiniformes", "Sphyrnidae", "Sphyrna"),
    ("Raja clavata", "Thornback ray", "Chondrichthyes", "Rajiformes", "Rajidae", "Raja"),
    ("Mobula birostris", "Giant manta ray", "Chondrichthyes", "Myliobatiformes", "Mobulidae", "Mobula"),
    ("Takifugu rubripes", "Japanese pufferfish", "Actinopterygii", "Tetraodontiformes", "Tetraodontidae", "Takifugu"),
    ("Diodon hystrix", "Porcupinefish", "Actinopterygii", "Tetraodontiformes", "Diodontidae", "Diodon"),]



# -------- helper functions ----------
def safe_stem(name: str) -> str:
    """Turn *name* into a string that is safe to use as a folder/file stem.

    Leading and trailing whitespace is stripped first. After that, every
    character that is not an ASCII letter, an ASCII digit or an underscore
    (this includes interior spaces) is replaced by a single "_".
    """
    trimmed = name.strip()
    return "".join(
        ch if ch == "_" or (ch.isascii() and ch.isalnum()) else "_"
        for ch in trimmed
    )

def build_query(binomial, common):
    """Compose the image-search keyword string for one species.

    Each of the two names is wrapped in double quotes, the two are joined
    with ``OR``, and the literal word ``fish`` is appended.
    """
    return '"{}" OR "{}" fish'.format(binomial, common)

# -------- prepare folders ----------
# Resulting layout: dataCLIP/images/<safe_stem(binomial)>/<downloaded files>
root = Path("dataCLIP")
img_root = root / "images"
img_root.mkdir(parents=True, exist_ok=True)

# -------- download images ----------
# A separate crawler is created per species so that each species' downloads
# are stored in that species' own folder.
for binomial, common, clas, order, family, genus in tqdm(TAXA, desc="Taxa"):
    out_dir = img_root / safe_stem(binomial)
    out_dir.mkdir(parents=True, exist_ok=True)

    search_terms = build_query(binomial, common)
    storage_cfg = {"root_dir": str(out_dir)}

    google_crawler = GoogleImageCrawler(storage=storage_cfg)
    google_crawler.crawl(keyword=search_terms, max_num=25, overwrite=False)

    # Fixed pause between species — presumably to stay polite towards the
    # search backend; confirm before shortening.
    time.sleep(5)

def is_valid_image(p: Path, min_side: int = 224) -> bool:
    """Return True if *p* is a readable image whose shorter side is big enough.

    Args:
        p: Path of the file to check.
        min_side: Minimum allowed length, in pixels, of the image's shorter
            side. Defaults to 224, the threshold this script previously had
            hard-coded.

    Returns:
        True when the file can be opened and verified and
        ``min(width, height) >= min_side``. False otherwise, including when
        opening, verifying or reading the size raises any exception.
    """
    try:
        # First pass: integrity check only.
        with Image.open(p) as im:
            im.verify()
        # Second pass on a fresh handle: Pillow's documentation says the file
        # must be reopened before the image is used again after verify().
        with Image.open(p) as im:
            w, h = im.size
    except Exception:
        # Deliberately broad: a corrupt download can fail in many different
        # ways, and every such failure simply means "not a usable image".
        return False
    return min(w, h) >= min_side

# -------- validate downloads ----------
# Delete every downloaded file that fails is_valid_image() and count the ones
# that pass. Only files with one of these suffixes are examined; anything
# else in a species folder is neither checked, deleted nor counted.
VALIDATED_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp"}

kept = 0
for species_dir in img_root.iterdir():
    if not species_dir.is_dir():
        continue
    # Snapshot the listing up front because entries are deleted as we go.
    for fp in list(species_dir.iterdir()):
        if not (fp.is_file() and fp.suffix.lower() in VALIDATED_SUFFIXES):
            continue
        if is_valid_image(fp):
            kept += 1
            continue
        try:
            fp.unlink()
        except OSError as exc:
            # Best effort: one undeletable file must not abort the whole
            # clean-up, but report it so the bad image is not silently left
            # in the dataset.
            print(f"Could not delete invalid image {fp}: {exc}")

print(f"Kept {kept} images after validation.")

In [None]:
import os

# Set your main directory path here
main_dir = "./dataCLIP/images/"

# Define valid image extensions
image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"}

# Number of images each species folder is expected to contain; this matches
# the max_num=25 requested from the crawler in the download step.
expected_count = 25

# Walk through subfolders.
# The loop variable is deliberately not named `root`: this notebook keeps the
# dataset directory in a module-level Path called `root`, and rebinding it to
# an os.walk string here would break the later `root / "metadata.csv"`.
for dirpath, _dirnames, filenames in os.walk(main_dir):
    # Skip the main folder itself
    if dirpath == main_dir:
        continue

    # Count image files in this subfolder (extension match is case-insensitive)
    image_count = sum(
        1 for f in filenames if os.path.splitext(f.lower())[1] in image_extensions
    )

    # Report results
    folder_name = os.path.basename(dirpath)
    if image_count == expected_count:
        print(f"✅ {folder_name} has {expected_count} images")
    else:
        print(f"⚠️ {folder_name} has {image_count} images (not {expected_count})")

In [None]:
# -------- build metadata.csv with CLIP-friendly captions ----------
import zlib

# Derive the dataset directory from img_root rather than from the global
# `root`: `root` is a common loop-variable name and may have been rebound to a
# plain string (e.g. by an os.walk loop) earlier in the notebook session.
manifest_path = img_root.parent / "metadata.csv"
caption_templates = [
    "A photograph of {common} ({binomial}).",
    "A high quality image of the fish species {binomial}, commonly called {common}.",
    "Underwater photo of {common}, scientific name {binomial}.",
    "{common} in its typical appearance. Scientific name {binomial}."
]

with open(manifest_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["filepath", "text", "binomial", "common_name", "order", "family", "genus"])
    # TAXA entries have six fields (binomial, common, class, order, family,
    # genus). The class is not a manifest column, so it is unpacked and
    # discarded; unpacking only five names raises ValueError.
    for (binomial, common, _clas, order, family, genus) in TAXA:
        species_dir = img_root / safe_stem(binomial)
        if not species_dir.is_dir():
            continue
        # Sorted so the row order is the same on every run and platform.
        # NOTE(review): the validation step also keeps .webp files, which are
        # not listed here — confirm whether excluding them is intended.
        for fp in sorted(species_dir.iterdir()):
            if fp.is_file() and fp.suffix.lower() in {".jpg", ".jpeg", ".png"}:
                # Choose one caption deterministically. The built-in hash()
                # of a str is randomised per interpreter process, so a CRC of
                # the file name is used to get the same caption on every run.
                template_idx = zlib.crc32(fp.name.encode("utf-8")) % len(caption_templates)
                text = caption_templates[template_idx].format(
                    common=common, binomial=binomial
                )
                writer.writerow([str(fp), text, binomial, common, order, family, genus])

print(f"Wrote manifest: {manifest_path}")