# Herpeton Project


## Dataset Information
- **Source**: [BGLab/BioTrove-Train on Hugging Face](https://huggingface.co/datasets/BGLab/BioTrove-Train)
- **Focus**: Reptilia taxonomic class (snakes, lizards, turtles, etc.)
- **Total Dataset Size**: ~135M samples across 7 taxonomic groups
- **Estimated Reptilia Subset**: ~1.3M labeled reptile images across 189+ species
- **Official Processing**: Uses `arbor_process` library for metadata preprocessing

## Setup Requirements
1. Install required packages
2. Download metadata files from HuggingFace
3. Process using official BioTrove tools
4. Download images and create ML-ready dataset

# Pre-Work

Depending upon how this notebook is run, compatability issues are likely to crop up, so this compatibility cell is here to solve for PyArrow issues encountered when first run.

In [4]:
#pip install -r /content/requirements.txt

Collecting jupyter>=1.0.0 (from -r /content/requirements.txt (line 31))
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
INFO: pip is looking at multiple versions of opencv-python to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-python>=4.5.0 (from -r /content/requirements.txt (line 13))
  Downloading opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting jupyterlab (from jupyter>=1.0.0->-r /content/requirements.txt (line 31))
  Downloading jupyterlab-4.5.0-py3-none-any.whl.metadata (16 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets>=7.6.0->-r /content/requirements.txt (line 32))
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter>=1.0.0->-r /content/requirements.txt (line 31))
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from ju

In [None]:
python quick_start.py

In [5]:
# Install and address any PyArrow compatibility issues
import subprocess
import sys
import importlib

def check_and_fix_pyarrow():
    """Check PyArrow version and fix compatibility issues"""
    print("Checking PyArrow compatibility...")

    try:
        import pyarrow as pa
        print(f"PyArrow version: {pa.__version__}")

        # Test for the extension type attribute
        if hasattr(pa.lib, 'PyExtensionType'):
            print("PyExtensionType is available")
            return True
        else:
            print("PyExtensionType not found - version issue detected")
            return False

    except ImportError:
        print("PyArrow not installed")
        return False

def fix_pyarrow_compatibility():
    """Fix PyArrow version compatibility"""
    print("\nFixing PyArrow compatibility...")

    try:
        # Uninstall and reinstall with specific compatible versions if needed
        print("Uninstalling existing PyArrow...")
        subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "pyarrow"])

        print("Installing compatible PyArrow version...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow>=12.0.0,<15.0.0"])

        # Ensure pandas compatibility
        print("Updating pandas for compatibility...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pandas>=1.5.0"])

        print("PyArrow compatibility fix complete!")
        return True

    except subprocess.CalledProcessError as e:
        print(f"Failed to fix PyArrow: {e}")
        return False

def install_package(package_name, import_name=None):
    """Install a package and verify it can be imported"""
    if import_name is None:
        import_name = package_name

    try:
        # Try to import the package
        importlib.import_module(import_name)
        print(f"{package_name} is already installed")
        return True
    except ImportError:
        print(f"Installing {package_name}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
            print(f"{package_name} installed successfully")
            return True
        except subprocess.CalledProcessError as e:
            print(f"Failed to install {package_name}: {e}")
            return False

print("Setting up BioTrove Reptilia processing environment...")
print("=" * 60)

# Check and fix PyArrow if needed
if not check_and_fix_pyarrow():
    print("\nAttempting to fix PyArrow compatibility...")
    if fix_pyarrow_compatibility():
        # Re-check after fix
        if check_and_fix_pyarrow():
            print("PyArrow issue resolved!")
        else:
            print("WARNING: PyArrow issue persists")
    else:
        print("ERROR: Could not automatically fix PyArrow")

print("\n" + "=" * 60)

# List of required packages (with specific versions to avoid conflicts)
required_packages = [
    ("arbor-process", "arbor_process"),
    ("nest_asyncio", "nest_asyncio"),
    ("datasets>=2.14.0", "datasets"),
    ("pandas>=1.5.0", "pandas"),
    ("numpy", "numpy"),
    ("matplotlib", "matplotlib"),
    ("seaborn", "seaborn"),
    ("pillow", "PIL"),
    ("requests", "requests"),
    ("tqdm", "tqdm")
]

# Install packages
all_installed = True
for package_name, import_name in required_packages:
    if not install_package(package_name, import_name):
        all_installed = False

if all_installed:
    print("\nAll packages installed successfully")
    print("Ready to process BioTrove Reptilia dataset")
else:
    print("\nWARNING: Some packages failed to install")
    print("You may need to install them manually")

print("\nNOTE: If PyArrow errors persist, try:")
print("   pip install --force-reinstall pyarrow==14.0.2")
print("   pip install --force-reinstall pandas==2.0.3")

print("\n" + "=" * 60)

Setting up BioTrove Reptilia processing environment...
Checking PyArrow compatibility...
PyArrow version: 18.1.0
PyExtensionType is available



ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Step 1: Extract Reptilia Metadata from BioTrove

Using HuggingFace's efficient filter() method to extract only Reptilia samples from the 135M sample dataset.

In [None]:
def extract_reptilia_efficient():
    """
    Efficiently extract Reptilia samples using HuggingFace datasets with filtering.
    Since Reptilia exists in the dataset but is distributed throughout 135M samples,
    the dataset's built-in filtering capabilities are used.
    """
    from datasets import load_dataset
    from tqdm.auto import tqdm

    print("EFFICIENT REPTILIA EXTRACTION")
    print("="*60)
    print("Using filter() to extract only Reptilia class")


    # Load dataset in streaming mode
    print("Loading BioTrove-Train dataset...")
    dataset = load_dataset('BGLab/BioTrove-Train', streaming=True, split='train')

    # Filter for Reptilia class
    print("Applying Reptilia filter...")
    reptilia_dataset = dataset.filter(lambda example: example['class'] == 'Reptilia')

    print("Extracting Reptilia samples...")
    reptilia_samples = []

    pbar = tqdm(desc="Collecting Reptilia", unit=" samples")

    # Collect samples
    target_samples = 2000  # Collect 2000 Reptilia samples

    for idx, item in enumerate(reptilia_dataset):
        reptilia_samples.append(item)

        if idx < 10:  # Show first 10
            print(f"\n   Sample {idx+1}:")
            print(f"      scientificName: {item.get('scientificName', 'Unknown')}")
            print(f"      family: {item.get('family', 'Unknown')}")
            print(f"      order: {item.get('order', 'Unknown')}")

        pbar.update(1)
        pbar.set_description(f"Collected {len(reptilia_samples)} Reptilia")

        if len(reptilia_samples) >= target_samples:
            print(f"\n   Reached target of {target_samples} samples")
            break

    pbar.close()

    print(f"\n{'='*60}")
    print(f"EXTRACTION COMPLETE:")
    print(f"   Total Reptilia samples collected: {len(reptilia_samples)}")

    if reptilia_samples:
        # Save to parquet chunks
        os.makedirs("biotrove_metadata", exist_ok=True)
        chunk_size = 250
        saved_files = []

        print(f"\n   Saving {len(reptilia_samples)} samples in chunks of {chunk_size}...")

        for i in range(0, len(reptilia_samples), chunk_size):
            chunk_data = reptilia_samples[i:i+chunk_size]
            chunk_df = pd.DataFrame(chunk_data)

            chunk_filename = f"biotrove_metadata/reptilia_chunk_{i//chunk_size}.parquet"
            chunk_df.to_parquet(chunk_filename, index=False)
            saved_files.append(chunk_filename)

            print(f"      Chunk {i//chunk_size}: {len(chunk_data)} samples -> {chunk_filename}")

        print(f"\n   ✓ Saved {len(saved_files)} parquet files")

        # Show species diversity
        species_list = [s.get('scientificName', 'Unknown') for s in reptilia_samples]
        unique_species = set(species_list)
        print(f"   ✓ {len(unique_species)} unique species")

        # Show top families
        families = [s.get('family', 'Unknown') for s in reptilia_samples]
        from collections import Counter
        top_families = Counter(families).most_common(10)
        print(f"\n   Top 10 families:")
        for family, count in top_families:
            print(f"      {family}: {count} samples")

        return {
            'samples': reptilia_samples,
            'saved_files': saved_files,
            'unique_species': len(unique_species)
        }
    else:
        print("   ✗ No Reptilia samples found")
        return None

# Run the efficient extraction
reptilia_data = extract_reptilia_efficient()

## Step 2: Verify Extracted Data

Check the extracted Reptilia samples before processing.

In [3]:
# Verify the extracted Reptilia data
import glob

print("VERIFICATION: Extracted Reptilia Data")
print("="*60)

# Find all reptilia chunk files
chunk_files = sorted(glob.glob("biotrove_metadata/reptilia_chunk_*.parquet"))

print(f"Found {len(chunk_files)} parquet chunk files:")
for file in chunk_files:
    df = pd.read_parquet(file)
    print(f"   {file}: {len(df)} samples")

# Load all chunks
print("\nLoading all Reptilia data...")
all_dfs = [pd.read_parquet(f) for f in chunk_files]
reptilia_df = pd.concat(all_dfs, ignore_index=True)

print(f"\nTOTAL REPTILIA DATASET:")
print(f"   Total samples: {len(reptilia_df):,}")
print(f"   Unique species: {reptilia_df['scientificName'].nunique()}")
print(f"   Unique families: {reptilia_df['family'].nunique()}")
print(f"   Unique orders: {reptilia_df['order'].nunique()}")

print(f"\nCOLUMNS:")
print(f"   {list(reptilia_df.columns)}")

print(f"\nORDER DISTRIBUTION:")
order_counts = reptilia_df['order'].value_counts()
for order, count in order_counts.items():
    print(f"   {order}: {count} samples ({count/len(reptilia_df)*100:.1f}%)")

print(f"\nTOP 15 SPECIES:")
species_counts = reptilia_df['scientificName'].value_counts().head(15)
for species, count in species_counts.items():
    family = reptilia_df[reptilia_df['scientificName'] == species]['family'].iloc[0]
    print(f"   {species:<40} ({family:<20}): {count} samples")

print(f"\nFIRST 3 SAMPLE ROWS:")
print(reptilia_df.head(3).to_string(index=False))

print(f"\n{'='*60}")
print("✓ Reptilia dataset successfully extracted and verified!")
print("✓ Ready for BioTrove processing pipeline")

VERIFICATION: Extracted Reptilia Data
Found 0 parquet chunk files:

Loading all Reptilia data...


NameError: name 'pd' is not defined

## Step 3: Download Images and Create Dataset

Download images from URLs and create ML-ready dataset with image-text pairs.

In [None]:
# Install arbor_process library for BioTrove processing
import subprocess
import sys

try:
    import arbor_process
    print("arbor_process is already installed")
except ImportError:
    print("Installing arbor_process library...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "arbor-process"])
    import arbor_process
    print("Successfully installed arbor_process")

In [None]:
# Download Reptilia images from metadata URLs
import os
import glob
import requests
from PIL import Image
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor, as_completed

print("REPTILIA IMAGE DOWNLOAD")
print("="*60)

# Create output directories
os.makedirs("biotrove_processed", exist_ok=True)
os.makedirs("biotrove_processed/images", exist_ok=True)

# Load all Reptilia chunks
chunk_files = sorted(glob.glob("biotrove_metadata/reptilia_chunk_*.parquet"))
print(f"Loading {len(chunk_files)} Reptilia parquet chunks...")

all_chunks = [pd.read_parquet(f) for f in chunk_files]
reptilia_metadata = pd.concat(all_chunks, ignore_index=True)

print(f"Total Reptilia samples: {len(reptilia_metadata):,}")
print(f"Unique species: {reptilia_metadata['scientificName'].nunique()}")

# Save combined metadata
metadata_path = "biotrove_processed/reptilia_metadata.csv"
reptilia_metadata.to_csv(metadata_path, index=False)
print(f"\nSaved combined metadata to: {metadata_path}")

def download_image(row):
    """Download a single image from URL"""
    photo_id = row['photo_id']
    """Download single image from URL"""

    try:
        response = requests.get(photo_url, timeout=10)
        response.raise_for_status()

        # Open and verify image
        img = Image.open(BytesIO(response.content))

        # Save image
        img_filename = f"biotrove_processed/images/{photo_id}.jpg"
        img.convert('RGB').save(img_filename, 'JPEG', quality=95)

        return {'photo_id': photo_id, 'status': 'success', 'path': img_filename}
    except Exception as e:
        return {'photo_id': photo_id, 'status': 'failed', 'error': str(e)}

# Download images with parallel processing
print("\n" + "="*60)
print("DOWNLOADING IMAGES")
print("="*60)
print(f"Downloading {len(reptilia_metadata)} images with 4 parallel workers...")

successful_downloads = []
failed_downloads = []

with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit all download tasks
    futures = {executor.submit(download_image, row): idx
               for idx, row in reptilia_metadata.iterrows()}

    # Process completed downloads with progress bar
    with tqdm(total=len(futures), desc="Downloading") as pbar:
        for future in as_completed(futures):
            result = future.result()

            if result['status'] == 'success':
                successful_downloads.append(result)
            else:
                failed_downloads.append(result)

            pbar.update(1)
            pbar.set_description(f"Downloaded: {len(successful_downloads)}, Failed: {len(failed_downloads)}")

print(f"\n{'='*60}")
print(f"DOWNLOAD COMPLETE!")
print(f"  Successfully downloaded: {len(successful_downloads)} images")
print(f"  Failed downloads: {len(failed_downloads)} images")
print(f"  Success rate: {len(successful_downloads)/len(reptilia_metadata)*100:.1f}%")

# Create image-text pairs dataset
print("\n" + "="*60)
print("CREATING IMAGE-TEXT PAIRS")
print("="*60)

successful_photo_ids = [d['photo_id'] for d in successful_downloads]
successful_metadata = reptilia_metadata[reptilia_metadata['photo_id'].isin(successful_photo_ids)].copy()

# Add image paths
successful_metadata['image_path'] = successful_metadata['photo_id'].apply(
    lambda x: f"biotrove_processed/images/{x}.jpg"
)

# Create text descriptions
successful_metadata['text_description'] = successful_metadata.apply(
    lambda row: f"{row['scientificName']} ({row['common_name']}) - {row['family']}, Order {row['order']}, Class Reptilia",
    axis=1
)

# Save final dataset
final_dataset_path = "biotrove_processed/reptilia_dataset_final.csv"
successful_metadata.to_csv(final_dataset_path, index=False)

print(f"Created {len(successful_metadata)} image-text pairs")
print(f"Saved to: {final_dataset_path}")

print(f"\n{'='*60}")
print(f"PROCESSING COMPLETE!")
print(f"  Metadata: {metadata_path}")
print(f"  Images: biotrove_processed/images/ ({len(successful_downloads)} files)")
print(f"  Final dataset: {final_dataset_path}")
print(f"  Dataset ready for computer vision training!")

In [None]:
# Explore the processed dataset
import random

print("PROCESSED DATASET EXPLORATION")
print("="*60)

# Check directory structure
print("\nDirectory structure:")
for root, dirs, files in os.walk("biotrove_processed"):
    level = root.replace("biotrove_processed", "").count(os.sep)
    indent = " " * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = " " * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files
        print(f"{subindent}{file}")
    if len(files) > 5:
        print(f"{subindent}... and {len(files)-5} more files")

# Load processed metadata
if os.path.exists("biotrove_processed/metadata.csv"):
    processed_df = pd.read_csv("biotrove_processed/metadata.csv")

    print(f"\n{'='*60}")
    print(f"PROCESSED DATASET STATISTICS:")
    print(f"  Total samples: {len(processed_df):,}")
    print(f"  Unique species: {processed_df['scientificName'].nunique()}")
    print(f"  Unique families: {processed_df['family'].nunique()}")
    print(f"  Unique orders: {processed_df['order'].nunique()}")

    print(f"\nOrder distribution:")
    for order, count in processed_df['order'].value_counts().items():
        print(f"  {order}: {count} samples ({count/len(processed_df)*100:.1f}%)")

    print(f"\nTop 10 species:")
    for species, count in processed_df['scientificName'].value_counts().head(10).items():
        print(f"  {species}: {count} samples")

    # Show sample images if available
    image_dir = "biotrove_processed/images"
    if os.path.exists(image_dir):
        image_files = [f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]
        if image_files:
            print(f"\n{'='*60}")
            print(f"Sample images available: {len(image_files)}")

            # Display a few random samples
            sample_images = random.sample(image_files, min(3, len(image_files)))
            print(f"\nDisplaying {len(sample_images)} random samples...")

            for img_file in sample_images:
                img_path = os.path.join(image_dir, img_file)
                # Get metadata for this image
                img_id = os.path.splitext(img_file)[0]
                if img_id.isdigit():
                    sample_info = processed_df[processed_df['photo_id'] == int(img_id)]
                    if not sample_info.empty:
                        print(f"\n  Image: {img_file}")
                        print(f"    Species: {sample_info.iloc[0]['scientificName']}")
                        print(f"    Family: {sample_info.iloc[0]['family']}")
                        print(f"    Common name: {sample_info.iloc[0].get('common_name', 'N/A')}")

print(f"\n{'='*60}")
print("Dataset ready for computer vision training.")