<a href="https://colab.research.google.com/github/mdhornstein/computer-vision-interpretability-paleontology-v0/blob/main/notebooks/00_prepare_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prepare Fossil Image Dataset (FID) for classification experiments

In [1]:
import os
import random
import shutil
import requests
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path

In [2]:
# --- Setup: clone repo if running in Colab ---

# Change this to your repo
REPO_URL = "https://github.com/mdhornstein/computer-vision-interpretability-paleontology-v0.git"
REPO_NAME = REPO_URL.split("/")[-1].replace(".git", "")

if "google.colab" in str(get_ipython()):
    if not os.path.exists(REPO_NAME):
        !git clone {REPO_URL}
    %cd {REPO_NAME}/notebook
    !git pull
    !pwd

Cloning into 'computer-vision-interpretability-paleontology-v0'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), done.
Resolving deltas: 100% (1/1), done.
/content/computer-vision-interpretability-paleontology-v0
Already up to date.
/content/computer-vision-interpretability-paleontology-v0


In [3]:
# --- Config ---
# Which fossil groups to include? Adjust as needed
CLASSES = ["Trilobita", "Ammonoidea"]
DATA_DIR = "../data"   # relative to repo root
VAL_SPLIT = 0.2
SEED = 42

In [4]:
# Download and extract the dataset from Zenodo ---
!wget -O reduced-FID.zip "https://zenodo.org/records/6333970/files/reduced-FID.zip?download=1"

--2025-09-24 09:13:39--  https://zenodo.org/records/6333970/files/reduced-FID.zip?download=1
Resolving zenodo.org (zenodo.org)... 188.185.45.92, 188.185.48.194, 188.185.43.25, ...
Connecting to zenodo.org (zenodo.org)|188.185.45.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1819654497 (1.7G) [application/octet-stream]
Saving to: ‘reduced-FID.zip’


2025-09-24 09:49:01 (838 KB/s) - ‘reduced-FID.zip’ saved [1819654497/1819654497]



In [9]:
!unzip -o reduced-FID.zip > fid_out.log 2> fid_err.log

In [None]:
!rm reduced-FID.zip

In [10]:
!ls reduced-FID

 agnatha	  conodont	   marine_reptile   sauropodomorph
 ammonoid	  coral		   myriapod	    shark_teeth
 amphibian	  crinoid	   nautiloid	    snake
 angiosperm	  crocodylomorph   ophiuroid	    sponge
 avialae	  crustacean	   ornithischian    spore_or_pollen
 belemnite	  echinoid	   osteichthyes     starfish
 bivalve	  foraminifer	   ostracod	    stromatolite
 blastoid	  gastropod	   petrified_wood  'Terms of access.docx'
 bone_fragment	  graptolite	   placoderms	    theropod
 brachiopod	  gymnosperm	   pteridophyte     trace_fossil
 bryozoan	  insect	   pterosaurs	    trilobite
 chelicerate	  mammal	   radiolarian	    turtle
 chondrichthyes   mammal_teeth	   reptile_teeth


In [11]:
# List the first 10 files in the 'agnatha' directory of the extracted dataset
!ls reduced-FID/agnatha/ | head

045.jpg
061025.lamprey2-200.jpg
0D4AAOSwnBRgNsn~s-l1600.jpg
100.tumblr_msal0qg4oo1sh1ns2o2_500 (2).jpg
102.pict0155.jpg
102.tully_monster.jpg
102.tumblr_mn5i3ng5ss1spmwbxo1_400.jpg
103.48320371757_f14a613401_m.jpg
106.img_20200811_170136.jpg.ffaacadf2fa3b4a173b996acbbd58163.jpg
107.acd494bf0b97b6706cf53918be87cb2e.jpg


In [17]:
# --- Step 2: Prepare dataset for classification ---
# This cell splits the dataset into training and validation sets
# for the specified classes and organizes them into a new directory structure.

# Paths
SOURCE_DIR = Path("reduced-FID")

experiment_name = "exp1"
TARGET_DIR = Path(DATA_DIR) / experiment_name

# Clean up any old data dir if rerunning
if TARGET_DIR.exists():
    shutil.rmtree(TARGET_DIR)

# Create train/val dirs
for split in ["train", "val"]:
    (TARGET_DIR / split).mkdir(parents=True, exist_ok=True)

# Split ratio
train_ratio = 0.8

# Go through each clade directory
for clade_dir in SOURCE_DIR.iterdir():
    if clade_dir.is_dir():
        clade_name = clade_dir.name
        # Only process specified classes
        if clade_name in CLASSES:
            images = list(clade_dir.glob("*"))
            random.shuffle(images)

            split_idx = int(len(images) * train_ratio)
            train_imgs = images[:split_idx]
            val_imgs = images[split_idx:]

            # Create clade subdirs
            (TARGET_DIR / "train" / clade_name).mkdir(parents=True, exist_ok=True)
            (TARGET_DIR / "val" / clade_name).mkdir(parents=True, exist_ok=True)

            # Copy files
            for img in train_imgs:
                shutil.copy(img, TARGET_DIR / "train" / clade_name / img.name)
            for img in val_imgs:
                shutil.copy(img, TARGET_DIR / "val" / clade_name / img.name)

print("✅ Dataset prepared at:", TARGET_DIR.resolve())

✅ Dataset prepared at: /content/computer-vision-interpretability-paleontology-v0/data/exp1


/content
