# Prepare Dataset
In this notebook, how can prepare the dataset and decide which data to provide to the students and how much data to provide.

In [32]:
# Hyperparamters to control the dataset size

# Random, stratified sampled from all the available animals
# To provide to students as a starting point
AMOUNT_OF_EXAMPLES_PER_CLASS = 10
AMOUNT_OF_VALIDATION_DATA = 10

# Test Data to be kept private, not to be provided to students
AMOUNT_IN_TEST_DATA = 10
BASE_PATH = "../Data/raw-img"

ANIMALS =[
    "dog",
    "horse",
    "chicken",
    "cat",
    "cow",
    #"sheep",
    #"squirrel",
    #"spider",
    # "elephant",
    #"butterfly",
]

In [33]:
import os
import random
import shutil
from typing import Sequence, List
random.seed(42)

In [34]:
def sample_and_move_images(
    animals: List[str],
    base_path: str,
    destination: str,
    images_per_class: int,
    subfolder:bool,
    extensions: Sequence[str] = ("jpg", "jpeg", "png"),
) -> None:
    """
    For each folder name in `animals`, randomly select up to `images_per_class`
    image files (with the given extensions) from `base_path/<folder>` and move
    them into `test_path`, renaming them as <folder>_0.jpg, <folder>_1.png, etc.

    Args:
        animals: List of folder names under base_path.
        base_path: Path where the animal subfolders live.
        test_path: Path where sampled images will be moved to. Must exist.
        images_per_class: Max number of images to sample per folder.
        extensions: Tuple of allowed file extensions (without leading dot).
    """
    # Ensure destination exists

    os.makedirs(destination, exist_ok=True)


    for animal in animals:
        source_path = os.path.join(base_path, animal)
        os.makedirs(test_path, exist_ok=True)
        if not os.path.isdir(source_path):
            print(f"Warning: {source_path} is not a directory; skipping.")
            continue

        # Gather eligible images
        images = [
            fname for fname in os.listdir(source_path)
            if not fname.startswith(".")
            and fname.lower().split(".")[-1] in extensions
        ]

        counter = 0
        while counter < images_per_class and images:
            image = random.choice(images)
            ext = image.split('.')[-1]
            new_name = f"{animal}_{counter}.{ext}"
            src = os.path.join(source_path, image)
            # If subfolder is True, create a subfolder for the animal
            if subfolder:
                destination_path = os.path.join(destination, animal)
                os.makedirs(destination_path, exist_ok=True)
            else:
                destination_path = destination
            dst = os.path.join(destination_path, new_name)

            shutil.move(src, dst)
            print(f"Moved {image!r} → {new_name!r}")

            # Remove from list so we don't pick it again
            images.remove(image)
            counter += 1

## Create TEST Dataset

In [35]:
test_path ="..//Data/Test"

sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = test_path,
    images_per_class=AMOUNT_IN_TEST_DATA // len(ANIMALS),
    subfolder=False,
)

Moved 'OIP-s6RuEQ5WQQAh9iINw_vCKQHaFj.jpeg' → 'dog_0.jpeg'
Moved 'OIP-aD_7tN6EN8qfm9-faP0nbwHaE6.jpeg' → 'dog_1.jpeg'
Moved 'OIP-uxBktynkw_ukhvOnAejv8QHaFj.jpeg' → 'horse_0.jpeg'
Moved 'OIP-40XQTcNoQ1lH32aB59RFvgHaE7.jpeg' → 'horse_1.jpeg'
Moved 'OIP-uXM1HPppDRdmKZX3EYdkygHaFj.jpeg' → 'chicken_0.jpeg'
Moved '145.jpeg' → 'chicken_1.jpeg'
Moved '805.jpeg' → 'cat_0.jpeg'
Moved '611.jpeg' → 'cat_1.jpeg'
Moved 'OIP-4Zc7xV1V9dEJrUzVSq7JvQHaIk.jpeg' → 'cow_0.jpeg'
Moved 'OIP-f1R9xCcC4M_hYpc8L7MghAHaFj.jpeg' → 'cow_1.jpeg'


## Create VALIDATION Dataset

In [36]:
valid_path ="..//Data/Valid"
sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = valid_path,
    images_per_class=AMOUNT_IN_TEST_DATA // len(ANIMALS),
    subfolder=False,
)

Moved 'OIP-UkAzb9xfX1SpwvKb_ZgLZAHaFx.jpeg' → 'dog_0.jpeg'
Moved 'OIP-4bHDx3ENWf1QTaBQj7daeQHaEj.jpeg' → 'dog_1.jpeg'
Moved 'OIP-r-4qYCR_JgaxD7I3zoAgHAHaFj.jpeg' → 'horse_0.jpeg'
Moved 'OIP-vBTeKD_GydkSvocdbYdG8gHaFk.jpeg' → 'horse_1.jpeg'
Moved 'OIP-XQeEQoiBI0tibMSlreS5-gHaE7.jpeg' → 'chicken_0.jpeg'
Moved 'OIP-qh2QBHLMG3phTcdiW9lLcQHaE4.jpeg' → 'chicken_1.jpeg'
Moved '979.jpeg' → 'cat_0.jpeg'
Moved '1198.jpeg' → 'cat_1.jpeg'
Moved 'OIP-7RwhPPot3bDbuJa1xQBU1gHaEK.jpeg' → 'cow_0.jpeg'
Moved 'OIP-fbg6olyclASFKumUwW-bmwHaEs.jpeg' → 'cow_1.jpeg'


## Create TRAIN Dataset

In [37]:
train_path ="..//Data/Train"

sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = train_path,
    images_per_class=AMOUNT_IN_TEST_DATA // len(ANIMALS),
    subfolder=True,
)

Moved 'OIP-DEhqfvk3r-UQ2oz9-_i0jQHaHT.jpeg' → 'dog_0.jpeg'
Moved 'OIP-UCflrJiqkoD00zNEKbHsIgHaGh.jpeg' → 'dog_1.jpeg'
Moved 'OIP-jYyfJL5-pGI20j_11QfeYQHaF1.jpeg' → 'horse_0.jpeg'
Moved 'OIP-hf1qPQr7IqjbRI63b8YZuQHaFC.jpeg' → 'horse_1.jpeg'
Moved '1026.jpeg' → 'chicken_0.jpeg'
Moved '969.jpeg' → 'chicken_1.jpeg'
Moved '1821.jpeg' → 'cat_0.jpeg'
Moved '1725.jpeg' → 'cat_1.jpeg'
Moved 'OIP-BAyeHLQ1evDYMI9cGdAr3gAAAA.jpeg' → 'cow_0.jpeg'
Moved 'OIP-gHzcYfh3aJLrXaKtuDEjpQHaGa.jpeg' → 'cow_1.jpeg'
