In [2]:
# %pip install duckduckgo_search ddgs   # V1/V2 import `duckduckgo_search`; V3 imports `ddgs`


# V1. Old script

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
from duckduckgo_search import DDGS

# Function to Search Images
def search_images(query, max_images=100):
    """Look up image URLs for ``query`` via DuckDuckGo image search.

    Args:
        query: Search phrase, forwarded to ``DDGS.images`` as ``keywords``.
        max_images: Upper bound forwarded as ``max_results``.

    Returns:
        list: The non-empty ``"image"`` values of the search hits, in the
        order the search produced them. The number found is also printed.
    """
    found = []
    with DDGS() as client:
        hits = client.images(keywords=query, max_results=max_images)
        # Drop hits that have no (or an empty) "image" entry.
        found.extend(link for link in (hit.get("image") for hit in hits) if link)
    print(f"Found {len(found)} image URLs for '{query}'")
    return found

# Car models to scrape; each name is used as the image-search query.
car_models = [
    "Maruti Suzuki Swift",
    "Maruti Suzuki Baleno",
    "Mahindra Thar",
    "Tata Nexon",
    "Hyundai Creta",
    "Hyundai i20",
    "Kia Seltos",
    "Toyota Fortuner",
    "Honda City",
    "BMW X5",
]

# Root folder that the per-model image folders are written into.
base_dir = "car_dataset/train"
os.makedirs(base_dir, exist_ok=True)

# Function to Download Images
def download_car_images(model_name, max_images=100):
    """Search for images of one car model and save them as JPEG files.

    Files are written to ``<base_dir>/<slug>/<slug>_<i>.jpg`` where ``slug``
    is ``model_name`` lower-cased with spaces replaced by underscores and
    ``i`` is the position of the URL in the search results. Images are kept
    at their original resolution.

    Args:
        model_name: Car model name; also used verbatim as the search query.
        max_images: Maximum number of URLs requested from the search.

    Returns:
        None. Progress and per-URL failures are printed; a failed URL is
        skipped and does not abort the run.
    """
    slug = model_name.lower().replace(" ", "_")
    folder = os.path.join(base_dir, slug)
    os.makedirs(folder, exist_ok=True)

    urls = search_images(model_name, max_images)

    for i, url in enumerate(urls):
        try:
            response = requests.get(url, timeout=10)
            # Reject 4xx/5xx answers up front: their bodies are error pages
            # or placeholder images, not the picture that was searched for.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            img.save(os.path.join(folder, f"{slug}_{i}.jpg"))
            print(f"Saved image {i+1} for {model_name}")
        except Exception as e:
            # Best-effort scraping: report the failure and move on.
            print(f"Failed to download image {i+1}: {e}")

# Download images for all car models: one search-and-download pass per entry
# of car_models, requesting up to 100 image URLs for each.
for model in car_models:
    print("\n=== Scraping images for:", model)
    download_car_images(model, max_images=100)


# V2.

In [None]:
import os
import requests
import json
from PIL import Image
from io import BytesIO
from duckduckgo_search import DDGS

# Load custom queries.
# NOTE(review): the code below calls .items() on the result and len() on each
# value, so the file presumably holds {model name: [query, ...]} - confirm.
# The encoding is explicit so the platform default is never used for decoding.
with open("car_model_queries.json", "r", encoding="utf-8") as f:
    model_queries = json.load(f)

# Image Search
def search_images(query, max_images=100):
    """Return image URLs for ``query`` from a DuckDuckGo image search.

    Args:
        query: Search phrase, passed to ``DDGS.images`` as ``keywords``.
        max_images: Cap passed on as ``max_results``.

    Returns:
        list: Every truthy ``"image"`` value among the hits, in result order.
    """
    links = []
    with DDGS() as session:
        for hit in session.images(keywords=query, max_results=max_images):
            link = hit.get("image")
            if not link:
                # Hit without a usable image link.
                continue
            links.append(link)
    return links

# Save directory: root folder of the dataset. download_car_images() creates
# one sub-folder per car model beneath it.
base_dir = "car_dataset/train"
os.makedirs(base_dir, exist_ok=True)

# Download Function
def download_car_images(model_name, queries, total_images=200):
    """Download up to ``total_images`` images for one car model.

    The image budget is split evenly across ``queries`` (at least one image
    per query). Files are saved as ``<base_dir>/<slug>/<slug>_<n>.jpg`` where
    ``slug`` is ``model_name`` lower-cased with spaces replaced by
    underscores and ``n`` counts successful saves from 0. Numbering restarts
    at 0 on every call, so a repeated run overwrites earlier files.

    Args:
        model_name: Car model name, used for the folder and file names.
        queries: Sized collection of search phrases for this model.
        total_images: Maximum number of images to save in this call.

    Returns:
        None. Progress and per-URL failures are printed; failures are
        skipped rather than raised.
    """
    slug = model_name.lower().replace(" ", "_")
    folder = os.path.join(base_dir, slug)
    os.makedirs(folder, exist_ok=True)

    if not queries:
        # Nothing to search for; also avoids dividing by len(queries) == 0.
        print(f"No search queries for {model_name}; nothing to download.")
        return

    downloaded = 0
    per_query = max(total_images // len(queries), 1)

    for query in queries:
        if downloaded >= total_images:
            # Target reached: do not spend further searches.
            break
        urls = search_images(query, max_images=per_query)
        for url in urls:
            if downloaded >= total_images:
                break
            try:
                response = requests.get(url, timeout=10)
                # 4xx/5xx bodies are error pages or placeholders, not the image.
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert("RGB")
                fname = f"{slug}_{downloaded}.jpg"
                img.save(os.path.join(folder, fname))
                downloaded += 1
                print(f"Saved image {downloaded} for {model_name}")
            except Exception as e:
                # Best-effort scraping: report the failure and keep going.
                print(f"Failed to download from {url}: {e}")

# Download for all models: iterate the model -> queries mapping loaded from
# car_model_queries.json and fetch up to 100 images per model.
for model, queries in model_queries.items():
    print(f"\n=== Downloading for: {model}")
    download_car_images(model, queries, total_images=100)


# V3. Final working version

In [None]:
import os
import requests
import json
import time
from PIL import Image
from io import BytesIO
from ddgs import DDGS

#  Load custom queries.
#  NOTE(review): the code below calls .items() on the result and len() on each
#  value, so the file presumably holds {model name: [query, ...]} - confirm.
#  The encoding is explicit so the platform default is never used for decoding.
with open("car_model_queries.json", "r", encoding="utf-8") as f:
    model_queries = json.load(f)

#  Image Search 
def search_images(query, max_images=100, delay=1.0):
    """Return image URLs for ``query``, pausing once afterwards.

    Args:
        query: Search phrase passed to ``DDGS.images``.
        max_images: Cap forwarded as ``max_results``.
        delay: Seconds to sleep after the search (successful or not) so that
            consecutive searches are spaced out. ``0`` disables the pause.

    Returns:
        list: Truthy ``"image"`` values of the hits, in result order. If the
        search raises, the error is printed and whatever was collected so far
        (possibly nothing) is returned.
    """
    urls = []
    try:
        with DDGS() as ddgs:
            results = ddgs.images(query, max_results=max_images)
            for result in results or []:
                url = result.get("image")
                if url:
                    urls.append(url)
    except Exception as e:
        print(f"[ERROR] Image search failed for query '{query}': {e}")
    # Throttle per search, not per result: the network request happens inside
    # images(), so sleeping while iterating hits only adds idle time.
    if delay and delay > 0:
        time.sleep(delay)
    return urls

#  Save directory: root folder of the dataset. download_car_images() creates
#  one sub-folder per car model beneath it.
base_dir = "car_dataset/train"
os.makedirs(base_dir, exist_ok=True)

#  Download Function 
def download_car_images(model_name, queries, total_images=300, per_query_limit=100):
    """Top up one car model's folder to ``total_images`` JPEG files.

    Existing ``.jpg`` files in ``<base_dir>/<slug>/`` count towards the
    target, so an interrupted run can be resumed. The remaining budget is
    split evenly across ``queries`` (at least one image per query, at most
    ``per_query_limit``). New files are named ``<slug>_<NNN>.jpg`` using the
    first index, starting at the current file count, that is not taken yet,
    so existing files are never overwritten even if the numbering has gaps.

    Args:
        model_name: Car model name, used for the folder and file names.
        queries: Sized collection of search phrases for this model.
        total_images: Target number of ``.jpg`` files in the folder.
        per_query_limit: Upper bound on URLs requested per search.

    Returns:
        None. Progress and per-URL failures are printed; failures are
        skipped rather than raised.
    """
    slug = model_name.lower().replace(" ", "_")
    folder = os.path.join(base_dir, slug)
    os.makedirs(folder, exist_ok=True)

    # Count already downloaded images
    downloaded = len([f for f in os.listdir(folder) if f.endswith(".jpg")])

    if downloaded >= total_images:
        print(f"⏩ Skipping {model_name} — already has {downloaded} images.")
        return

    if not queries:
        # Nothing to search for; also avoids dividing by len(queries) == 0.
        print(f"⚠ No search queries for {model_name}; nothing to download.")
        return

    per_query = max((total_images - downloaded) // len(queries), 1)
    next_index = downloaded

    for query in queries:
        if downloaded >= total_images:
            # Target reached: skip the remaining searches and their delay.
            break
        print(f"🔎 Searching for: {query}")
        urls = search_images(query, max_images=min(per_query, per_query_limit))
        print(f"  → Found {len(urls)} image URLs.")

        for url in urls:
            if downloaded >= total_images:
                break
            try:
                response = requests.get(url, timeout=10)
                # 4xx/5xx bodies are error pages or placeholders, not the image.
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert("RGB")

                # The file count is not a safe index when files were removed
                # by hand; advance to the first name that is still free.
                target = os.path.join(folder, f"{slug}_{next_index:03d}.jpg")
                while os.path.exists(target):
                    next_index += 1
                    target = os.path.join(folder, f"{slug}_{next_index:03d}.jpg")

                img.save(target)
                next_index += 1
                downloaded += 1
                print(f"  ✔ Saved image {downloaded} for {model_name}")
            except Exception as e:
                # Best-effort scraping: report the failure and keep going.
                print(f"  ✖ Failed to download image: {e}")
            # Short pause between image requests.
            time.sleep(0.2)

    print(f"✅ Finished downloading for {model_name}: {downloaded} images.\n")


#  Download all models: iterate the model -> queries mapping loaded from
#  car_model_queries.json and bring every model's folder up to 300 images.
for model, queries in model_queries.items():
    print(f"\n📥 Downloading images for: {model}")
    download_car_images(model, queries, total_images=300)

print("🎉 All downloads completed!")


📥 Downloading images for: Tata Motors Harrier
🔎 Searching for: Tata Motors Harrier front view
  → Found 60 image URLs.
  ✖ Failed to download image: cannot identify image file <_io.BytesIO object at 0x00000212CD26DD50>
  ✖ Failed to download image: cannot identify image file <_io.BytesIO object at 0x00000212CD26E700>
  ✖ Failed to download image: cannot identify image file <_io.BytesIO object at 0x00000212CE642020>
  ✔ Saved image 1 for Tata Motors Harrier
  ✔ Saved image 2 for Tata Motors Harrier
  ✔ Saved image 3 for Tata Motors Harrier
  ✔ Saved image 4 for Tata Motors Harrier
  ✔ Saved image 5 for Tata Motors Harrier
  ✔ Saved image 6 for Tata Motors Harrier
  ✔ Saved image 7 for Tata Motors Harrier
  ✔ Saved image 8 for Tata Motors Harrier
  ✔ Saved image 9 for Tata Motors Harrier
  ✔ Saved image 10 for Tata Motors Harrier
  ✔ Saved image 11 for Tata Motors Harrier
  ✔ Saved image 12 for Tata Motors Harrier
  ✔ Saved image 13 for Tata Motors Harrier
  ✔ Saved image 14 for Tata Mo



  ✔ Saved image 42 for Tata Motors Harrier
  ✔ Saved image 43 for Tata Motors Harrier
  ✔ Saved image 44 for Tata Motors Harrier
  ✔ Saved image 45 for Tata Motors Harrier
  ✔ Saved image 46 for Tata Motors Harrier
  ✔ Saved image 47 for Tata Motors Harrier
  ✔ Saved image 48 for Tata Motors Harrier
  ✔ Saved image 49 for Tata Motors Harrier
  ✔ Saved image 50 for Tata Motors Harrier
  ✖ Failed to download image: cannot identify image file <_io.BytesIO object at 0x00000212CE643600>
  ✔ Saved image 51 for Tata Motors Harrier
  ✔ Saved image 52 for Tata Motors Harrier
  ✔ Saved image 53 for Tata Motors Harrier
🔎 Searching for: Tata Motors Harrier side view
  → Found 60 image URLs.
  ✔ Saved image 54 for Tata Motors Harrier
  ✔ Saved image 55 for Tata Motors Harrier
  ✔ Saved image 56 for Tata Motors Harrier
  ✔ Saved image 57 for Tata Motors Harrier
  ✔ Saved image 58 for Tata Motors Harrier
  ✔ Saved image 59 for Tata Motors Harrier
  ✔ Saved image 60 for Tata Motors Harrier
  ✔ Saved i