In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# /content/drive/MyDrive/... dataset paths used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder that the download cell writes the raw per-subspecies images into.
# NOTE: `base_path` is re-bound by later cells (the image-count cell points it
# at the extracted cleaned dataset), so its value depends on which cell ran last.
base_path = '/content/drive/MyDrive/tiger_datasets'

In [None]:
!pip install icrawler

import os
import shutil
from icrawler.builtin import BingImageCrawler

# Bing search phrases to crawl, keyed by the subspecies folder they are saved under.
search_map = {
    "bengal_tiger": [
        "Bengal tiger in the wild",
        "Royal Bengal tiger",
        "Bengal tiger jungle",
        "Bengal tiger India",
        "Bengal tiger reserve",
        "Bengal tiger habitat",
        "Bengal tiger close up",
        "Bengal tiger face",
    ],
    "siberian_tiger": [
        "Siberian tiger snow",
        "Amur tiger in forest",
        "Siberian tiger snow habitat",
        "Amur tiger close up",
        "Siberian tiger walking in snow",
        "Siberian tiger wildlife",
        "Siberian tiger hunting",
    ],
    "south_china_tiger": [
        "South China tiger",
        "rare South China tiger",
        "South China tiger zoo",
        "South China tiger endangered",
        "South China tiger nature",
        "South China tiger resting",
        "South China tiger photos",
    ],
    "sumatran_tiger": [
        "Sumatran tiger rainforest",
        "wild Sumatran tiger",
        "Sumatran tiger jungle",
        "Sumatran tiger close up",
        "Sumatran tiger Indonesia",
        "Sumatran tiger walking",
        "Sumatran tiger conservation",
    ],
    "indochinese_tiger": [
        "Indochinese tiger jungle",
        "Indochinese tiger",
        "Indochinese tiger forest",
        "Indochinese tiger Thailand",
        "Indochinese tiger prey",
        "Indochinese tiger photos",
        "Indochinese tiger resting",
    ],
    "malayan_tiger": [
        "Malayan tiger Malaysia",
        "Harimau Malaya",
        "Malayan tiger forest",
        "Harimau Malaya wildlife",
        "Malayan tiger face",
        "Malayan tiger national park",
        "Malayan tiger Malaysia jungle",
    ],
}

# Destination root on Drive, and how many images to request for each search term
base_path = '/content/drive/MyDrive/tiger_datasets'
images_per_term = 200

# For every subspecies, crawl each search term into its own temporary folder,
# then move the files up into the subspecies folder (no deduplication here).
for subspecies, terms in search_map.items():
    folder_name = subspecies.lower().replace(" ", "_")
    save_dir = os.path.join(base_path, folder_name)
    os.makedirs(save_dir, exist_ok=True)
    print(f"\n Saving to: {save_dir}")

    for term in terms:
        print(f"Downloading {images_per_term} for: {term}")

        # The same slug names the temporary folder and prefixes the final file names
        term_slug = term.replace(" ", "_")
        temp_dir = os.path.join(save_dir, term_slug)
        os.makedirs(temp_dir, exist_ok=True)

        crawler = BingImageCrawler(storage={'root_dir': temp_dir})
        crawler.crawl(keyword=term, max_num=images_per_term)

        for fname in os.listdir(temp_dir):
            src_path = os.path.join(temp_dir, fname)
            if os.path.isfile(src_path):
                try:
                    stem, ext = os.path.splitext(f"{term_slug}_{fname}")
                    dst_path = os.path.join(save_dir, stem + ext)

                    # Append _1, _2, ... until the destination name is unused
                    suffix = 0
                    while os.path.exists(dst_path):
                        suffix += 1
                        dst_path = os.path.join(save_dir, f"{stem}_{suffix}{ext}")

                    shutil.move(src_path, dst_path)

                except Exception as e:
                    # A file that could not be moved is reported and discarded
                    print(f"Error with {fname}: {e}")
                    if os.path.exists(src_path):
                        os.remove(src_path)

        # The per-term folder is only a staging area; drop it once emptied
        shutil.rmtree(temp_dir, ignore_errors=True)

print("\n All tiger images downloaded.")


In [None]:
!pip install imagehash

In [None]:
import os
from PIL import Image
import imagehash
from collections import defaultdict
import sys

def clean_and_standardize_images(input_dir, output_dir, target_size=(224, 224)):
    """Convert, resize and de-duplicate the images found directly in ``input_dir``.

    Every file whose name ends with a known image extension is opened,
    converted to RGB, resized to ``target_size`` and saved to ``output_dir``
    as ``processed_<original stem>.jpg`` (JPEG, quality 95). An image whose
    average hash equals that of an image already saved in this call is
    treated as a duplicate and not saved.

    Args:
        input_dir: Folder containing the raw images.
        output_dir: Folder to write results to; created if missing.
        target_size: ``(width, height)`` every output image is resized to.

    Returns:
        A ``(processed, skipped)`` tuple. ``skipped`` counts files without an
        image extension, duplicates, and files that raised while processing.
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp', '.jfif')

    os.makedirs(output_dir, exist_ok=True)
    hash_dict = defaultdict(list)
    written_names = set()
    processed = 0
    skipped = 0

    # Sorted so that which copy of a duplicate is kept does not depend on the
    # arbitrary order in which the filesystem lists the folder.
    for filename in sorted(os.listdir(input_dir)):
        file_path = os.path.join(input_dir, filename)

        if not filename.lower().endswith(image_exts):
            print(f"Skipping non-image: {filename}")
            skipped += 1
            continue

        try:
            # The context manager releases the file handle as soon as the
            # resized copy exists, instead of leaving one open per image.
            with Image.open(file_path) as source:
                rgb = source if source.mode == 'RGB' else source.convert('RGB')
                img = rgb.resize(target_size, Image.Resampling.LANCZOS)

            img_hash = str(imagehash.average_hash(img))
            if img_hash in hash_dict:
                print(f"Duplicate: {filename} matches {hash_dict[img_hash]}")
                skipped += 1
                continue

            # Inputs such as a.jpg and a.png share a stem; give the later one a
            # numeric suffix rather than overwriting the earlier output.
            stem = os.path.splitext(filename)[0]
            out_name = f"processed_{stem}.jpg"
            suffix = 1
            while out_name in written_names:
                out_name = f"processed_{stem}_{suffix}.jpg"
                suffix += 1

            img.save(os.path.join(output_dir, out_name), 'JPEG', quality=95)

            # Record the hash only after a successful save, so a failed save
            # cannot cause a later identical image to be dropped as a duplicate.
            hash_dict[img_hash].append(filename)
            written_names.add(out_name)
            processed += 1

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            skipped += 1

    return processed, skipped

def verify_dataset(directory, expected_size=(224, 224)):
    """Check that every file in ``directory`` is a readable image of the expected size.

    Args:
        directory: Folder whose entries are checked.
        expected_size: ``(width, height)`` each image must have. The default
            matches the default ``target_size`` of the cleaning step.

    Returns:
        A list of human-readable problem strings, one per offending entry:
        ``"<name> - Wrong size: <size>"`` or ``"<name> - Corrupted file"``.
        The list is empty when every entry passes.
    """
    expected_size = tuple(expected_size)
    corrupted = []

    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        try:
            with Image.open(file_path) as img:
                img.verify()
                if img.size != expected_size:
                    corrupted.append(f"{filename} - Wrong size: {img.size}")
        except Exception:
            # Anything that cannot be opened or verified is reported as
            # corrupted. Catching Exception instead of using a bare `except`
            # lets KeyboardInterrupt / SystemExit still stop a long scan.
            corrupted.append(f"{filename} - Corrupted file")

    return corrupted

def process_all_subspecies(base_input_dir, base_output_dir):
    """Clean and verify every subspecies folder found under ``base_input_dir``.

    Each immediate sub-directory of ``base_input_dir`` is passed to
    ``clean_and_standardize_images`` with a same-named folder under
    ``base_output_dir`` as its output, and that output folder is then checked
    with ``verify_dataset``. Progress and any problems are printed.

    Args:
        base_input_dir: Folder holding one sub-directory per subspecies.
        base_output_dir: Folder under which the cleaned copies are written.

    Returns:
        A ``(total_processed, total_skipped)`` tuple summed over all folders.

    Raises:
        SystemExit: With exit code 1 when ``base_input_dir`` does not exist.
    """
    if not os.path.exists(base_input_dir):
        print(f" Input directory '{base_input_dir}' does not exist.")
        print("Please make sure your raw tiger dataset is available in that folder.")
        sys.exit(1)

    # Only sub-directories count as subspecies; loose files are ignored
    subspecies_dirs = []
    for entry in os.listdir(base_input_dir):
        if os.path.isdir(os.path.join(base_input_dir, entry)):
            subspecies_dirs.append(entry)

    print(f"\n Found {len(subspecies_dirs)} tiger subspecies to process...")

    totals = [0, 0]  # running [processed, skipped]
    for subspecies in subspecies_dirs:
        print(f"\n Processing: {subspecies}")
        output_dir = os.path.join(base_output_dir, subspecies)

        processed, skipped = clean_and_standardize_images(
            os.path.join(base_input_dir, subspecies),
            output_dir,
        )
        totals[0] += processed
        totals[1] += skipped

        print(f"{subspecies} done — Processed: {processed}, Skipped: {skipped}")

        corrupted = verify_dataset(output_dir)
        if not corrupted:
            print(f"All images in {subspecies} verified OK!")
        else:
            print(f"Corrupted or invalid images in {subspecies}:")
            for item in corrupted:
                print(f"  - {item}")

    return totals[0], totals[1]

# Dataset locations: raw downloaded images are read from the first folder,
# cleaned and standardized copies are written to the second.
base_input_directory = "/content/drive/MyDrive/tiger_datasets"
base_output_directory = "/content/drive/MyDrive/tiger_datasets_cleaned"

if __name__ == "__main__":
    total_processed, total_skipped = process_all_subspecies(
        base_input_directory,
        base_output_directory,
    )

    # Overall summary across every subspecies folder
    print(
        "\n All processing complete!",
        f"Total images processed: {total_processed}",
        f"Total images skipped: {total_skipped}",
        sep="\n",
    )


In [None]:
!unzip "/content/drive/MyDrive/tiger_datasets_cleaned.zip" -d "/content/drive/MyDrive/tiger_datasets_cleaned"

Archive:  /content/drive/MyDrive/tiger_datasets_cleaned.zip
   creating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/
   creating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_001.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_002.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_003.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_004.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_005.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned/bengal_tiger/bengal_tiger_006.jpg  
  inflating: /content/drive/MyDrive/tiger_datasets_cl

In [None]:
import os

# Root of the extracted cleaned dataset on Google Drive
base_path = '/content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned'

# Only files ending in one of these extensions are counted
valid_exts = ('.jpg', '.jpeg', '.png', '.webp')

print("Image Count per Subspecies:\n")

for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)
    if not os.path.isdir(folder_path):
        continue

    num_images = sum(
        1 for entry in os.listdir(folder_path) if entry.lower().endswith(valid_exts)
    )
    print(f"{folder}: {num_images} images")

Image Count per Subspecies:

south_china_tiger: 455 images
sumatran_tiger: 796 images
malayan_tiger: 604 images
siberian_tiger: 442 images
bengal_tiger: 580 images
indochinese_tiger: 615 images


In [None]:
import os

#  Path to the dataset with subspecies folders
base_dir = '/content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned'  # or 'tiger_datasets' if unprocessed

# Files ending in one of these extensions are renamed; everything else is left alone
image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')


def rename_images_sequentially(folder_path, prefix, extensions=image_exts):
    """Rename the images in ``folder_path`` to ``<prefix>_001<ext>``, ``<prefix>_002<ext>``, ...

    Files are numbered in sorted-name order and the extension is lower-cased.
    The rename is done in two passes through staging names. Renaming directly
    is unsafe: on POSIX ``os.rename`` silently replaces an existing
    destination, so a file sorting early (e.g. ``processed_x.jpg``) would
    overwrite an already-numbered ``<prefix>_001.jpg`` before that file had
    been moved out of the way.

    Args:
        folder_path: Folder whose images are renamed in place.
        prefix: Text placed before the running number.
        extensions: Tuple of lower-case extensions that mark a file as an image.

    Returns:
        The number of files that ended up under a new name.
    """
    image_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(extensions)
    )

    # Work out every (current name, final name) pair first; files that already
    # carry their final name need no work.
    plan = []
    for idx, filename in enumerate(image_files, start=1):
        ext = os.path.splitext(filename)[1].lower()
        new_name = f"{prefix}_{idx:03d}{ext}"  # e.g., siberian_tiger_001.jpg
        if new_name != filename:
            plan.append((filename, new_name))

    # Pass 1: move every file that must change onto an unused staging name, so
    # that no final name is still occupied by a file waiting to be renamed.
    staged = []
    for filename, new_name in plan:
        attempt = 0
        staged_name = f"__renaming_{attempt}_{new_name}"
        while os.path.exists(os.path.join(folder_path, staged_name)):
            attempt += 1
            staged_name = f"__renaming_{attempt}_{new_name}"
        try:
            os.rename(
                os.path.join(folder_path, filename),
                os.path.join(folder_path, staged_name),
            )
        except OSError as e:
            print(f"Failed to rename {filename} → {new_name}: {e}")
            continue
        staged.append((staged_name, filename, new_name))

    # Pass 2: move staged files onto their final names, never overwriting.
    # A file that cannot be placed keeps its staging name; because that name
    # still ends in the image extension, a later run picks it up again.
    renamed = 0
    for staged_name, filename, new_name in staged:
        dst = os.path.join(folder_path, new_name)
        try:
            if os.path.exists(dst):
                raise FileExistsError(
                    f"{new_name} already exists; file kept as {staged_name}"
                )
            os.rename(os.path.join(folder_path, staged_name), dst)
            renamed += 1
        except OSError as e:
            print(f"Failed to rename {filename} → {new_name}: {e}")

    return renamed


#  Loop through each subspecies folder
for subspecies in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, subspecies)

    # Make sure it's a directory
    if not os.path.isdir(folder_path):
        continue

    print(f"\n Renaming images in: {subspecies}")

    # Subspecies name becomes the prefix
    prefix = subspecies.lower().replace(" ", "_")

    rename_images_sequentially(folder_path, prefix)

print("\n All renaming complete.")


 Renaming images in: bengal_tiger

 Renaming images in: indochinese_tiger

 Renaming images in: malayan_tiger

 Renaming images in: siberian_tiger

 Renaming images in: south_china_tiger

 Renaming images in: sumatran_tiger

 All renaming complete.


In [None]:
import os
import random
import shutil

# Set seed for reproducibility.
# NOTE: a single global RNG is shuffled once per subspecies, so the resulting
# split also depends on the order in which os.listdir() returns the folders.
random.seed(42)

# Path to the cleaned and renamed dataset
base_dir = '/content/drive/MyDrive/tiger_datasets_cleaned/tiger_datasets_cleaned'
# Destination root; files are copied to <split_base_dir>/<split>/<subspecies>/
split_base_dir = '/content/drive/MyDrive/tiger_datasets'

# Split ratios (the test split receives whatever train and val leave over)
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "train/val/test ratios must sum to 1"

# Files ending in one of these extensions are treated as images
image_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

# Loop through each subspecies
for subspecies in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, subspecies)

    if not os.path.isdir(folder_path):
        continue

    print(f"\n Processing: {subspecies}")

    prefix = subspecies.lower().replace(" ", "_")

    image_files = sorted([
        f for f in os.listdir(folder_path)
        if f.lower().endswith(image_exts)
    ])

    # Step 1: Rename images to <prefix>_NNN<ext>
    for idx, filename in enumerate(image_files, start=1):
        ext = os.path.splitext(filename)[1].lower()
        new_name = f"{prefix}_{idx:03d}{ext}"
        if new_name == filename:
            continue  # already has its final name

        src = os.path.join(folder_path, filename)
        dst = os.path.join(folder_path, new_name)
        try:
            # On POSIX os.rename silently replaces an existing destination,
            # which would destroy another image; refuse instead. The file then
            # simply keeps its current name, which does not affect the split.
            if os.path.exists(dst):
                raise FileExistsError("destination already exists")
            os.rename(src, dst)
        except Exception as e:
            print(f"Failed to rename {filename} → {new_name}: {e}")

    # Step 2: Reload the renamed list and shuffle (sorted first so the shuffle
    # starts from the same order every time)
    all_files = sorted([
        f for f in os.listdir(folder_path)
        if f.lower().endswith(image_exts)
    ])
    random.shuffle(all_files)

    n = len(all_files)
    n_train = int(train_ratio * n)
    n_val = int(val_ratio * n)

    splits = {
        "train": all_files[:n_train],
        "val": all_files[n_train:n_train + n_val],
        "test": all_files[n_train + n_val:]
    }

    # Step 3: Copy to new split directories
    for split, files in splits.items():
        split_dir = os.path.join(split_base_dir, split, subspecies)
        os.makedirs(split_dir, exist_ok=True)

        for fname in files:
            src_path = os.path.join(folder_path, fname)
            dst_path = os.path.join(split_dir, fname)
            shutil.copy2(src_path, dst_path)

        print(f" {split.upper()} set: {len(files)} images")

print("\n🎉 All subspecies split into train/val/test.")



 Processing: south_china_tiger
 TRAIN set: 318 images
 VAL set: 68 images
 TEST set: 69 images

 Processing: sumatran_tiger
 TRAIN set: 557 images
 VAL set: 119 images
 TEST set: 120 images

 Processing: malayan_tiger
 TRAIN set: 422 images
 VAL set: 90 images
 TEST set: 92 images

 Processing: siberian_tiger
 TRAIN set: 309 images
 VAL set: 66 images
 TEST set: 67 images

 Processing: bengal_tiger
 TRAIN set: 406 images
 VAL set: 87 images
 TEST set: 87 images

 Processing: indochinese_tiger
 TRAIN set: 430 images
 VAL set: 92 images
 TEST set: 93 images

🎉 All subspecies split into train/val/test.
