In [48]:
# run only once
!pip install duckduckgo_search




[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## How to search for images

In [49]:
def search_images_multi(keywords, per_keyword=100, max_total=300, verbose=True):
    """
    Vyhledá obrázky pro více klíčových slov, ale nepřekročí max_total.
    
    keywords: list[str] - seznam klíčových slov
    per_keyword: int - max obrázků na jedno slovo
    max_total: int - maximální celkový počet obrázků
    verbose: bool - pokud True, vypíše průběžné info
    
    return: list[str] - seznam URL obrázků
    """
    all_urls = []
    for word in keywords:
        if len(all_urls) >= max_total:
            break  # už máme dost
        urls = search_images(word, per_keyword)
        
        # kolik ještě můžeme přidat, aby nebyl překročen limit
        space_left = max_total - len(all_urls)
        urls = urls[:space_left]
        
        all_urls.extend(urls)
        
        if verbose:
            print(f"'{word}': přidáno {len(urls)} obrázků (celkem {len(all_urls)})")
    
    # odstraníme duplicity
    all_urls = list(set(all_urls))
    
    if verbose:
        print(f"Celkem {len(all_urls)} unikátních obrázků (limit {max_total})")
    
    return all_urls



**Example:**

Note we got less images than we asked for.

In [50]:
keywords = ["car", "sports car", "sedan", "suv",
        "race car", "truck"]
#"car", "sports car", "sedan", "suv", "race car", "truck"
# "fish" = ["fish", "goldfish", "salmon", "trout","tuna", "catfish", "bass"]
# "flower": "flower", "rose", "sunflower", "tulip","orchid", "daisy", "lotus flower", "lavender flower"
# "bird": "bird", "parrot", "sparrow", "eagle", "owl", "seagull", "robin"
image_urls = search_images_multi(
    keywords, 
    per_keyword=100, 
    max_total=200
)

len(image_urls)



'car': přidáno 100 obrázků (celkem 100)
'sports car': přidáno 100 obrázků (celkem 200)
Celkem 198 unikátních obrázků (limit 200)


198

Let us take a random image and look at it:

In [51]:
image_urls[6]

'https://dailycarblog.com/wp-content/uploads/2014/11/Vauxhall-Corsa-2015.jpg'

## How to download images

In [52]:
import os
import requests
from urllib.parse import urlparse
import warnings

def download_image(url, folder, custom_name=None, verbose=True):
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Get the filename from the URL or use the custom name
    if custom_name:
        filename = custom_name
    else:
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            filename = 'image.jpg'  # Default filename if none is found in the URL

    # Ensure the filename has an extension
    if not os.path.splitext(filename)[1]:
        filename += '.jpg'

    filepath = os.path.join(folder, filename)

    # If the file already exists, append a number to make it unique
    base, extension = os.path.splitext(filepath)
    counter = 1
    while os.path.exists(filepath):
        filepath = f"{base}_{counter}{extension}"
        counter += 1

    try:
        # Send a GET request to the URL with a timeout of 10 seconds
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Check if the content type is an image
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image'):
            if verbose:
                warnings.warn(f"The URL does not point to an image. Content-Type: {content_type}")
            return False

        # Write the image content to the file
        with open(filepath, 'wb') as f:
            f.write(response.content)

        if verbose:
            print(f"Image successfully downloaded: {filepath}")
        return True

    except requests.exceptions.Timeout:
        if verbose: 
            warnings.warn(f"Download timed out for URL: {url}")
    except requests.exceptions.HTTPError as e:
        if verbose: 
            warnings.warn(f"HTTP error occurred: {e}")
    except requests.exceptions.RequestException as e:
        if verbose: 
            warnings.warn(f"An error occurred while downloading the image: {e}")
    except IOError as e:
        if verbose: 
            warnings.warn(f"An error occurred while writing the file: {e}")

    return False

Let us donwload all cactus into separate folder.

In [53]:
from tqdm.notebook import tqdm

for i, url in enumerate(tqdm(image_urls)):
    download_image(url, "./DataSet/car/", f'image{i}.jpg', verbose=False)

  0%|          | 0/198 [00:00<?, ?it/s]

## How to resize all images to 28x28

In [54]:
# ...
from torchvision import datasets, transforms
# ...

# Data transformations
transform = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
])

# Load the dataset
dataset = datasets.ImageFolder(root='./dataset', transform=transform)

Do not forget the images are RGB, not greyscale as FashionMNIST. Therefore, they will have **three** channels on input, not one.

In [55]:
import torch.nn as nn  # Ensure the nn module is imported

# Define the simple CNN
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        # input channels = 3 (RGB), output channels = 16
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        # ...