In [None]:
# Requires the icrawler package. Install it once into this kernel with: %pip install icrawler

In [None]:
from icrawler.builtin import BingImageCrawler, GoogleImageCrawler, BaiduImageCrawler

import os

# Search keywords, one per cattle breed; each keyword also names the breed's output folder
breeds = [
    "Holstein cow", "Jersey cow", "Angus cow", "Brahman cow", "Hereford cow",
    "Simmental cow", "Limousin cow", "Guernsey cow", "Charolais cow", "Ayrshire cow"
]

# Directory to save images
base_dir = "cattle_dataset"
os.makedirs(base_dir, exist_ok=True)

# Request up to 1000 images per breed from each search engine (Bing, Google, Baidu).
# max_num is only an upper bound; an engine may return fewer results.
for breed in breeds:
    folder_name = breed.replace(" ", "_")
    save_path = os.path.join(base_dir, folder_name)
    os.makedirs(save_path, exist_ok=True)

    # All three crawlers store into the same folder and name files by a running
    # index (000001.jpg, 000002.jpg, ...). With the default file_idx_offset=0 each
    # crawler restarts at 000001, so later crawlers hit names that already exist
    # and skip those downloads (overwrite defaults to False).
    # file_idx_offset="auto" continues numbering after the highest index already
    # present in the folder, so every engine contributes new files.
    # NOTE: re-running this cell appends another batch to existing folders;
    # delete cattle_dataset/ first if a clean re-crawl is wanted.

    # Bing images
    crawler = BingImageCrawler(storage={'root_dir': save_path})
    crawler.crawl(keyword=breed, max_num=1000, file_idx_offset="auto")
    print(f"Downloaded images for {breed}")

    # Google images
    google_crawler = GoogleImageCrawler(storage={'root_dir': save_path})
    google_crawler.crawl(keyword=breed, max_num=1000, file_idx_offset="auto")
    print(f"Downloaded ~1000 images for {breed} from Google")

    # Baidu images
    baidu_crawler = BaiduImageCrawler(storage={'root_dir': save_path})
    baidu_crawler.crawl(keyword=breed, max_num=1000, file_idx_offset="auto")
    print(f"Downloaded ~1000 images for {breed} from Baidu")



In [None]:
from PIL import Image
import os

def resize_images(input_folder, size=(224, 224)):
    """Resize every image under ``input_folder/<subfolder>/`` in place.

    Each readable image is converted to RGB, resized to ``size`` and written
    back over the original file. A file that cannot be opened, converted,
    resized or saved is deleted, and a message naming it is printed.

    Args:
        input_folder: Directory whose immediate sub-directories contain the
            image files. Entries directly inside ``input_folder`` that are not
            directories are ignored.
        size: Target ``(width, height)`` in pixels. Defaults to ``(224, 224)``.

    Returns:
        None.
    """
    for folder in os.listdir(input_folder):
        folder_path = os.path.join(input_folder, folder)
        if not os.path.isdir(folder_path):
            # A stray file next to the class folders; os.listdir on it would raise.
            continue
        for img_name in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_name)
            if not os.path.isfile(img_path):
                # Nested directories are not images, and os.remove cannot delete them.
                continue
            try:
                # Close the source handle before overwriting the same path.
                with Image.open(img_path) as img:
                    # Converting to RGB lets images with an alpha channel or a
                    # palette be saved under a .jpg name instead of failing on
                    # save and being deleted as if they were corrupted.
                    resized = img.convert("RGB").resize(size)
                resized.save(img_path)
            except Exception as exc:
                # Exception (not a bare except) so that interrupting the cell
                # with KeyboardInterrupt does not delete the current image.
                print(f"Removing unreadable image {img_path}: {exc}")
                os.remove(img_path)  # remove corrupted images

# Resize every image under cattle_dataset/<breed>/ to 224x224 in place;
# resize_images deletes any file it fails to open, resize, or save.
resize_images("cattle_dataset")


In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

src_folder = "cattle_dataset"
dst_folder = "dataset"
splits = ['train', 'val', 'test']
split_ratios = [0.7, 0.15, 0.15]  # 70% train, 15% val, 15% test

# Derive both train_test_split fractions from split_ratios so the declared
# ratios and the actual split cannot drift apart.
# The hold-out fraction is val + test rather than 1 - train: 1 - 0.7 is
# 0.30000000000000004 in floating point, which can push the hold-out size up
# by one image once train_test_split rounds it up.
holdout_ratio = split_ratios[1] + split_ratios[2]   # share that leaves the train set
test_share = split_ratios[2] / holdout_ratio        # share of the hold-out that becomes test

# Copy each breed's images into dataset/<split>/<breed>/.
# NOTE: files already present in dst_folder from an earlier run are not removed.
for breed in sorted(os.listdir(src_folder)):
    breed_path = os.path.join(src_folder, breed)
    if not os.path.isdir(breed_path):
        # Ignore stray files next to the breed folders.
        continue

    # os.listdir returns names in arbitrary order; sorting makes the split
    # reproducible for the fixed random_state below.
    images = sorted(
        name for name in os.listdir(breed_path)
        if os.path.isfile(os.path.join(breed_path, name))
    )

    try:
        train, temp = train_test_split(images, test_size=holdout_ratio, random_state=42)
        val, test = train_test_split(temp, test_size=test_share, random_state=42)
    except ValueError as exc:
        # train_test_split raises ValueError when a resulting subset would be
        # empty (too few images); skip this breed instead of aborting the loop.
        print(f"Skipping {breed}: cannot split {len(images)} image(s) ({exc})")
        continue

    for split, files in zip(splits, [train, val, test]):
        split_path = os.path.join(dst_folder, split, breed)
        os.makedirs(split_path, exist_ok=True)
        for file in files:
            shutil.copy(os.path.join(breed_path, file), os.path.join(split_path, file))


In [None]:
import json

# Build a {split: {breed: [file names]}} mapping by listing each
# <dst_folder>/<split>/<breed>/ directory on disk.
dataset_info = {
    split_name: {
        breed_name: os.listdir(os.path.join(dst_folder, split_name, breed_name))
        for breed_name in os.listdir(os.path.join(dst_folder, split_name))
    }
    for split_name in splits
}

# Write the mapping as indented JSON alongside the split folders.
info_path = os.path.join(dst_folder, "dataset_info.json")
with open(info_path, "w") as info_file:
    json.dump(dataset_info, info_file, indent=2)


print("Dataset structure saved to dataset_info.json")

