# Prepare Dataset
In this notebook, how can prepare the dataset and decide which data to provide to the students and how much data to provide.

In [42]:
# Hyperparamters to control the dataset size

# Random, stratified sampled from all the available animals
# To provide to students as a starting point
AMOUNT_OF_EXAMPLES_PER_CLASS = 10
AMOUNT_OF_VALIDATION_DATA = 2

# Test Data to be kept private, not to be provided to students
AMOUNT_IN_TEST_DATA = 400
BASE_PATH = "../Data/raw_img"

ANIMALS =[
    "dog",
    "horse",
    "chicken",
    "cat",
    "cow",
    #"sheep",
    #"squirrel",
    #"spider",
    # "elephant",
    #"butterfly",
]

In [34]:
import os
import random
import shutil
from typing import Sequence, List
random.seed(42)

In [35]:
def sample_and_move_images(
    animals: List[str],
    base_path: str,
    destination: str,
    images_per_class: int,
    subfolder:bool,
    extensions: Sequence[str] = ("jpg", "jpeg", "png"),
) -> None:
    """
    For each folder name in `animals`, randomly select up to `images_per_class`
    image files (with the given extensions) from `base_path/<folder>` and move
    them into `test_path`, renaming them as <folder>_0.jpg, <folder>_1.png, etc.

    Args:
        animals: List of folder names under base_path.
        base_path: Path where the animal subfolders live.
        test_path: Path where sampled images will be moved to. Must exist.
        images_per_class: Max number of images to sample per folder.
        extensions: Tuple of allowed file extensions (without leading dot).
    """
    # Ensure destination exists

    os.makedirs(destination, exist_ok=True)


    for animal in animals:
        source_path = os.path.join(base_path, animal)
        os.makedirs(test_path, exist_ok=True)
        #print(f'Sourcepath Correct: {os.path.exists(source_path)}')
        if not os.path.isdir(source_path):
            print(f"Warning: {source_path} is not a directory; skipping.")
            continue

        # Gather eligible images
        images = [
            fname for fname in os.listdir(source_path)
            if not fname.startswith(".")
            and fname.lower().split(".")[-1] in extensions
        ]

        counter = 0
        while counter < images_per_class and images:
            image = random.choice(images)
            ext = image.split('.')[-1]
            new_name = f"{animal}_{counter}.{ext}"
            src = os.path.join(source_path, image)
            # If subfolder is True, create a subfolder for the animal
            if subfolder:
                destination_path = os.path.join(destination, animal)
                os.makedirs(destination_path, exist_ok=True)
            else:
                destination_path = destination
            dst = os.path.join(destination_path, new_name)

            shutil.move(src, dst)
            print(f"Moved {image!r} → {new_name!r}")

            # Remove from list so we don't pick it again
            images.remove(image)
            counter += 1

## Create TEST Dataset

In [36]:
test_path ="..//Data/Test"

sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = test_path,
    images_per_class=AMOUNT_IN_TEST_DATA // len(ANIMALS),
    subfolder=False,
)

Moved 'OIP-Zuze_I8QQh7TYu0zgndPBwHaFj.jpeg' → 'dog_0.jpeg'
Moved 'OIP-fbVBaid9zoEclR0Up691SAHaKV.jpeg' → 'dog_1.jpeg'
Moved 'OIP-Tx86JrsI9viv15xZNvGgoQHaFj.jpeg' → 'dog_2.jpeg'
Moved 'OIP-PLI0_qtEX1tS6XfgkldplQHaNd.jpeg' → 'dog_3.jpeg'
Moved 'OIP-Fz6kwHTCkGNotjuv2cNtWAHaFj.jpeg' → 'dog_4.jpeg'
Moved 'OIP-zVBD-VpWdRFDASJFpJ1vkwHaFj.jpeg' → 'dog_5.jpeg'
Moved 'OIP-B95cXok1TG-y5TP2HLSRwAHaE8.jpeg' → 'dog_6.jpeg'
Moved 'OIP-TiX_ItcIg1xU4xyE3WFmbgHaHa.jpeg' → 'dog_7.jpeg'
Moved 'OIP-PFeEDHzfg-4J3Xq687vluwHaFB.jpeg' → 'dog_8.jpeg'
Moved 'OIP-OhqPfFexC9VObznCLb4RpQHaFj.jpeg' → 'dog_9.jpeg'
Moved 'OIP-MkQkszPInF3bDH7PskgOeQHaGh.jpeg' → 'dog_10.jpeg'
Moved 'OIP-38XnIrqYcFxWjwra3b1a2wHaJ3.jpeg' → 'dog_11.jpeg'
Moved 'OIP-prCN-8G9qKUQFTqMaT1SPgHaFL.jpeg' → 'dog_12.jpeg'
Moved 'OIP-pXRyU259nBzWN2ajuFEvqAHaF_.jpeg' → 'dog_13.jpeg'
Moved 'OIP-qT8oCpiN1cWaVhcpwf9WgAHaLG.jpeg' → 'dog_14.jpeg'
Moved 'OIP-hGaPLz-vaWlQZWa5ZGM48wHaFj.jpeg' → 'dog_15.jpeg'
Moved 'OIP-atuQV_J4-dv6v7YAVIoK1AHaFj.jpeg' → 'dog

## Create VALIDATION Dataset

In [43]:
valid_path ="..//Data/Valid"
sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = valid_path,
    images_per_class=AMOUNT_OF_VALIDATION_DATA,
    subfolder=False,
)

Moved 'OIP-SUCOsfhJzq7pEvIi7z4BqgHaEK.jpeg' → 'dog_0.jpeg'
Moved 'OIP-CeQ4sXXlXyPc1-awqaoeywHaFj.jpeg' → 'dog_1.jpeg'
Moved 'OIP-J74k9sRQfvsrqnJnb2C7YwHaE7.jpeg' → 'horse_0.jpeg'
Moved 'OIP-DEhtJ7L-bjfHpsQQjjZp5QHaEK.jpeg' → 'horse_1.jpeg'
Moved 'OIP-AZmJXvW4wlXO9nUV1l2vcQHaFj.jpeg' → 'chicken_0.jpeg'
Moved 'OIP-mW7aKwZL_FhC5qmte8U2wgHaFj.jpeg' → 'chicken_1.jpeg'
Moved '1739.jpeg' → 'cat_0.jpeg'
Moved '1629.jpeg' → 'cat_1.jpeg'
Moved 'OIP-DABxAJ08GuU8PrOeezqI1wHaLE.jpeg' → 'cow_0.jpeg'
Moved 'OIP-BvRaInFuiCAZGfFEMQE7ogAAAA.jpeg' → 'cow_1.jpeg'


## Create TRAIN Dataset

In [44]:
train_path ="..//Data/Train"

sample_and_move_images(
    animals=ANIMALS,
    base_path=BASE_PATH,
    destination = train_path,
    images_per_class=AMOUNT_OF_EXAMPLES_PER_CLASS,
    subfolder=True,
)

Moved 'OIP-ghhGNBPT-ubs5zYQSK8vkwHaI3.jpeg' → 'dog_0.jpeg'
Moved 'OIP-4AQGJw_pRzKLWCRSd7rQAgHaGX.jpeg' → 'dog_1.jpeg'
Moved 'OIP-QGAgX-sM86clDQQu4wpKngHaFj.jpeg' → 'dog_2.jpeg'
Moved 'OIP-7PflA-0CUOe_CqSKgIkzlwHaLH.jpeg' → 'dog_3.jpeg'
Moved 'OIP-FZLeu6wX6jIdlKW-bVyxCAHaFh.jpeg' → 'dog_4.jpeg'
Moved 'OIP-xf5CztduzS9rciZs5qygTwAAAA.jpeg' → 'dog_5.jpeg'
Moved 'OIP-LASkiOTkE7GgyBkjRz8gkQHaFj.jpeg' → 'dog_6.jpeg'
Moved 'OIP-7nE58ZRQG7mWCYYOinOuRgAAAA.jpeg' → 'dog_7.jpeg'
Moved 'OIP-1wq9vP9_aPSM7THOwyo2NwHaHq.jpeg' → 'dog_8.jpeg'
Moved 'OIP-egfvgyzLritcoe4p5yBDBwHaEK.jpeg' → 'dog_9.jpeg'
Moved 'OIP-6_nLf4o02Gxb5OLHFSNHHgHaEn.jpeg' → 'horse_0.jpeg'
Moved 'OIP-TH9NjAh3mju8N8Wk9Hl56AHaEK.jpeg' → 'horse_1.jpeg'
Moved 'OIP-nBPmCSV-qaLaromE8r-IegHaFj.jpeg' → 'horse_2.jpeg'
Moved 'OIP-ILm2BmfW4p5McIZjqwe46QHaFj.jpeg' → 'horse_3.jpeg'
Moved 'OIP-1OxdZ9vRQ6-hNwg094tVlQHaE7.jpeg' → 'horse_4.jpeg'
Moved 'OIP-i-P9skjtd9kWuZrKpDp0dQHaE4.jpeg' → 'horse_5.jpeg'
Moved 'OIP-zwvm9B5IEpwA7LRC45aUsQHaFj.jpeg' 