In [5]:
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time

def download_images(query, num_images, output_dir):
    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Set the headers to mimic a browser visit
    headers = {
        "User-Agent": "Safari/537.36"
            }
    
    # Format the query for URL encoding
    query = urllib.parse.quote(query)
    
    downloaded = 0
    page = 0
    
    while downloaded < num_images:
        # Construct the Google Image search URL with pagination
        search_url = f"https://www.google.com/search?q={query}&tbm=isch&start={page*20}"
        
        # Get the HTML content of the search page
        response = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all image elements
        img_tags = soup.find_all('img')[1:]  # Skip the first image which is usually the Google logo
        
        if not img_tags:
            break
        
        for i, img in enumerate(img_tags):
            if downloaded >= num_images:
                break
            try:
                img_url = img.get('src') or img.get('data-src')
                if not img_url or not img_url.startswith('http'):
                    continue
                img_data = requests.get(img_url).content
                with open(os.path.join(output_dir, f"{query}_{downloaded}.jpg"), 'wb') as handler:
                    handler.write(img_data)
                print(f"Downloaded {query}_{downloaded}.jpg")
                downloaded += 1
            except Exception as e:
                print(f"Could not download image {downloaded}: {e}")
        
        # Increase page count to move to the next set of images
        page += 1
        time.sleep(5)  # Pause to avoid overwhelming the server

# Start the crawling process with error handling
download_images("monarch butterfly", 1000, "data/monarch butterfly")
download_images("swallowtail butterfly", 1000, "data/swallowtail butterfly")
download_images("red admiral butterfly", 1000, "data/red admiral butterfly")
download_images("peacock butterfly", 1000, "data/peacock butterfly")
download_images("painted lady butterfly", 1000, "data/painted lady butterfly")
download_images("blue morpho butterfly", 1000, "data/blue morpho butterfly")
download_images("viceroy butterfly", 1000, "data/viceroy butterfly")
download_images("zebra longwing butterfly", 1000, "data/zebra longwing butterfly")
download_images("buckeye butterfly", 1000, "data/buckeye butterfly")
download_images("skipper butterfly", 1000, "data/skipper butterfly")

Downloaded monarch%20butterfly_0.jpg
Downloaded monarch%20butterfly_1.jpg
Downloaded monarch%20butterfly_2.jpg
Downloaded monarch%20butterfly_3.jpg
Downloaded monarch%20butterfly_4.jpg
Downloaded monarch%20butterfly_5.jpg
Downloaded monarch%20butterfly_6.jpg
Downloaded monarch%20butterfly_7.jpg
Downloaded monarch%20butterfly_8.jpg
Downloaded monarch%20butterfly_9.jpg
Downloaded monarch%20butterfly_10.jpg
Downloaded monarch%20butterfly_11.jpg
Downloaded monarch%20butterfly_12.jpg
Downloaded monarch%20butterfly_13.jpg
Downloaded monarch%20butterfly_14.jpg
Downloaded monarch%20butterfly_15.jpg
Downloaded monarch%20butterfly_16.jpg
Downloaded monarch%20butterfly_17.jpg
Downloaded monarch%20butterfly_18.jpg
Downloaded monarch%20butterfly_19.jpg
Downloaded monarch%20butterfly_20.jpg
Downloaded monarch%20butterfly_21.jpg
Downloaded monarch%20butterfly_22.jpg
Downloaded monarch%20butterfly_23.jpg
Downloaded monarch%20butterfly_24.jpg
Downloaded monarch%20butterfly_25.jpg
Downloaded monarch%20b

In [8]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Directory containing all subdirectories of images
base_dir = '/Users/marissasara/data'

# Prepare lists for storing image paths and labels
image_paths = []
labels = []

# Populate the lists
for label_folder in os.listdir(base_dir):
    images_folder_path = os.path.join(base_dir, label_folder)
    if os.path.isdir(images_folder_path):  # Ensure it is a directory
        for image_file in os.listdir(images_folder_path):
            image_paths.append(os.path.join(images_folder_path, image_file))
            labels.append(label_folder)

# Check if the lists are populated
print(f"Total images: {len(image_paths)}")
print(f"Total labels: {len(labels)}")

# Convert lists to numpy arrays for easier handling
image_paths = np.array(image_paths)
labels = np.array(labels)


Total images: 6650
Total labels: 6650


In [9]:
if len(image_paths) > 0 and len(labels) > 0:
    # Split data into training and remaining data
    train_paths, test_val_paths, train_labels, test_val_labels = train_test_split(
        image_paths, labels, test_size=0.3, random_state=42, stratify=labels)

    # Split remaining data into testing and validation
    val_paths, test_paths, val_labels, test_labels = train_test_split(
        test_val_paths, test_val_labels, test_size=0.5, random_state=42, stratify=test_val_labels)

    # Function to copy images to designated folder
    def copy_images(image_paths, labels, subset_name):
        for label in np.unique(labels):
            subset_dir = os.path.join(base_dir, subset_name, label)
            os.makedirs(subset_dir, exist_ok=True)
            label_paths = image_paths[labels == label]
            for src_path in label_paths:
                shutil.copy(src_path, subset_dir)

    # Create directories and copy images
    copy_images(train_paths, train_labels, 'train')
    copy_images(val_paths, val_labels, 'val')
    copy_images(test_paths, test_labels, 'test')

    # Verify the splits
    print(f"Training set size: {len(train_paths)}")
    print(f"Validation set size: {len(val_paths)}")
    print(f"Testing set size: {len(test_paths)}")
else:
    print("No images found to split.")


Training set size: 4655
Validation set size: 997
Testing set size: 998
