In [15]:
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

In [16]:
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

In [17]:
def get_all_imgs(url):
    soup = bs(requests.get(url).content, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src")
        if not img_url:
            continue  # skip if img tag contains no src attr
        img_url = urljoin(url, img_url)
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        if is_valid(img_url):
            urls.append(img_url)
    return urls

In [18]:
def download(url, pathname):
    """
    Download the file with the given URL, 
    and puts it in the folder followed by the pathname
    """
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
        
    # Download the body of response by chunk, not immediately
    response = requests.get(url, stream=True)
    
    # Get the total file size
    file_size = int(response.headers.get("Content-Length", 0))
    
    # Get the file name
    filename = os.path.join(pathname, url.split("/")[-1])
    
    # Progress bar, changing the unit to bytes instead of iteration,
    # Which is default by tqdm
    progress = tqdm(response.iter_content(1024), \
                    f"Downloading{filename}", \
                    total=file_size, \
                    unit="B", \
                    unit_scale=True, \
                    unit_divisor=1024)
    
    with open(filename, "wb") as f:
        for data in progress.iterable:
            f.write(data)
            progress.update(len(data))

In [19]:
def main(url, path):
    imgs = get_all_imgs(url)
    for img in imgs:
        download(img, path)

In [None]:
main("https://unsplash.com/", "test")

Extracting images: 100%|████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 59697.22it/s]
Downloadingtest/p: 100%|██████████████████████████████████████████████████████████████████████| 43.0/43.0 [00:00<00:00, 12.2kB/s]
Downloadingtest/photo-1660866838212-df428c885827: 100%|█████████████████████████████████████| 19.8M/19.8M [00:08<00:00, 2.37MB/s]
Downloadingtest/photo-1662581871665-f299ba8ace07: 100%|█████████████████████████████████████| 14.8M/14.8M [00:05<00:00, 2.84MB/s]
Downloadingtest/photo-1662733625046-ac900bb77091: 100%|█████████████████████████████████████| 17.0M/17.0M [00:08<00:00, 2.10MB/s]
Downloadingtest/photo-1662729429584-ff91702c2c87: 100%|█████████████████████████████████████| 16.8M/16.8M [00:07<00:00, 2.38MB/s]
Downloadingtest/photo-1662725461878-08cf9a790168: 100%|██████████████████████████████████████| 3.05M/3.05M [00:03<00:00, 918kB/s]
Downloadingtest/photo-1662735142465-c443674e0390: 100%|███████████████████████████████████

Downloadingtest/photo-1661956602153-23384936a1d3: 100%|█████████████████████████████████████| 7.29M/7.29M [00:04<00:00, 1.87MB/s]
Downloadingtest/profile-fb-1525756603-79e579036b2d.jpg: 100%|███████████████████████████████| 41.5k/41.5k [00:00<00:00, 72.4kB/s]
Downloadingtest/photo-1662729429584-ff91702c2c87: 100%|█████████████████████████████████████| 16.8M/16.8M [00:07<00:00, 2.42MB/s]
Downloadingtest/profile-1538664356252-59adf811daa6: 100%|██████████████████████████████████████| 400k/400k [00:01<00:00, 348kB/s]
Downloadingtest/photo-1662810331468-a260f67a936f: 100%|█████████████████████████████████████| 9.99M/9.99M [00:05<00:00, 2.08MB/s]
Downloadingtest/profile-1662724645903-b5fe2f155627image: 100%|█████████████████████████████████| 744k/744k [00:01<00:00, 407kB/s]
Downloadingtest/photo-1662725461878-08cf9a790168: 100%|█████████████████████████████████████| 3.05M/3.05M [00:02<00:00, 1.07MB/s]
Downloadingtest/profile-1662280031968-d74d65cd9d71image: 100%|████████████████████████████