In [None]:
import os
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import time

# Global lookup table: maps each celebrity category to the list of people in it.
# The category names become the first-level folder names and the people's
# names are used as the image-search queries (see main()).
celebrity_dict = {
    "Politician": ["George Washington", "Abraham Lincoln", "Donald Trump", "Elizabeth II", "Barack Obama"],
    "Actor": ["Charlie Chaplin", "Natalie Portman", "Leonardo DiCaprio", "Tom Hanks", "Jackie Chan"],
    "Director": ["Alfred Hitchcock", "Steven Spielberg", "Quentin Tarantino", "Martin Scorsese", "James Cameron"],
    "Poet": ["Emily Dickinson", "Li Bai", "William Wordsworth", "Rumi", "Jorge Luis Borges"],
    "Artist": ["Pablo Picasso", "Vincent van Gogh", "Leonardo da Vinci", "Claude Monet", "Michelangelo"],
    "Musician": ["Ludwig van Beethoven", "Wolfgang Amadeus Mozart", "Johann Sebastian Bach", "David Bowie", "Miles Davis"],
    "Athlete": ["Usain Bolt", "Michael Phelps", "Serena Williams", "Cristiano Ronaldo", "Muhammad Ali"],
    "Singer": ["Taylor Swift", "Han Hong", "Elvis Presley", "Freddie Mercury", "Whitney Houston"],
    "Writer": ["William Shakespeare", "Lu Xun", "Mark Twain", "Leo Tolstoy", "Gabriel García Márquez"],
    "Scientist": ["Isaac Newton", "Albert Einstein", "Thomas Edison", "Nikola Tesla", "Tu Youyou"]
}

# Scrape Bing image-search results for a query and save the images to disk
def download_images(query, save_folder, num_images=10):
    """Download up to ``num_images`` images for ``query`` from Bing image search.

    The search result page is fetched once, every ``<img class="mimg">`` tag is
    inspected, and each image that can be fetched and decoded by Pillow is
    saved as ``<save_folder>/<query with underscores>_<n>.<format>``.

    Args:
        query: Search phrase (e.g. a person's name). It is URL-encoded by
            ``requests`` so spaces and non-ASCII characters are safe.
        save_folder: Directory to write images into; created if missing.
        num_images: Maximum number of images to save.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        search request or a failed individual download never raises.
    """
    search_url = "https://www.bing.com/images/search"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}

    try:
        # Passing the query via ``params`` lets requests percent-encode it,
        # and the timeout keeps a stalled connection from hanging the run.
        response = requests.get(
            search_url,
            params={"q": query, "FORM": "HDRSC2"},
            headers=headers,
            timeout=10,
        )
    except requests.RequestException as e:
        print(f"Failed to get response for {query}: {e}")
        return
    if response.status_code != 200:
        print(f"Failed to get response for {query}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    img_tags = soup.find_all("img", class_="mimg")

    os.makedirs(save_folder, exist_ok=True)
    file_stem = query.replace(" ", "_")

    count = 0
    for img_tag in img_tags:
        if count >= num_images:
            break

        # Some tags carry an inline ``data:`` placeholder in ``src`` (which
        # requests cannot fetch) and the real address in ``data-src``, so take
        # the first attribute that holds an actual http(s) URL.
        img_url = next(
            (
                url
                for url in (img_tag.get("src"), img_tag.get("data-src"))
                if url and url.startswith(("http://", "https://"))
            ),
            None,
        )
        if img_url is None:
            continue

        try:
            img_response = requests.get(img_url, headers=headers, timeout=10)
            # Do not try to decode an HTML error page as an image.
            img_response.raise_for_status()
            img = Image.open(BytesIO(img_response.content))
            img_format = img.format if img.format else "jpg"
            img.save(os.path.join(save_folder, f"{file_stem}_{count + 1}.{img_format.lower()}"))
            count += 1
            print(f"Saved: {query} - {count}")
        except (requests.RequestException, OSError, ValueError) as e:
            # Best effort: report the failure and move on to the next image.
            print(f"Error downloading {img_url}: {e}")
        # Throttle only after a real network request was attempted.
        time.sleep(1)
    print(f"{count}/{num_images} images downloaded for {query}")

# Main entry point: walk the celebrity dictionary and download images for everyone
def main(base_folder="Celebrity_Dataset", images_per_person=10):
    """Build the dataset tree ``base_folder/<category>/<person>/``.

    Iterates over every category and person in ``celebrity_dict`` and calls
    ``download_images`` for each person, using the person's name as the query.

    Args:
        base_folder: Root directory of the dataset; created if missing.
        images_per_person: Maximum number of images requested per person.
    """
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    os.makedirs(base_folder, exist_ok=True)

    for category, celebrities in celebrity_dict.items():
        category_folder = os.path.join(base_folder, category)
        os.makedirs(category_folder, exist_ok=True)

        for celeb in celebrities:
            # The per-person folder itself is created by download_images().
            celeb_folder = os.path.join(category_folder, celeb.replace(" ", "_"))
            print(f"Downloading images for: {celeb} in category: {category}")
            download_images(celeb, celeb_folder, images_per_person)
            print("-"*50)

# Run the full download with default settings only when executed as a script
# (or notebook cell), not when this module is imported.
if __name__ == "__main__":
    main()


Downloading images for: George Washington in category: Politician
Saved: George Washington - 1
Saved: George Washington - 2
Saved: George Washington - 3
Saved: George Washington - 4
Saved: George Washington - 5
Error downloading data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7: No connection adapters were found for 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7'
Error downloading data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7: No connection adapters were found for 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7'
Error downloading data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7: No connection adapters were found for 'data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7'
Error downloading data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7: No connection adapters were found for 'd