In [1]:
import requests
import os
from urllib.parse import urlparse
from pathlib import Path

In [2]:
# Header to mimic being a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

def download_images(url_file, save_dir):
    # Ensure save directory exists
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    with open(url_file, 'r', encoding='utf-8') as file:
        urls = file.readlines()

    for index, url in enumerate(urls, start=1):
        try:
            # Trim newline and spaces
            url = url.strip()

            # Get the file name from the URL
            file_name = os.path.basename(urlparse(url).path)

            # Check if the file already exists
            if os.path.exists(os.path.join(save_dir, file_name)):
                print(
                    f"({index}/{len(urls)}) File {file_name} already exists. Skipping...")
                continue

            # Make a request to the URL
            response = requests.get(url, headers=headers)

            # Raise an exception if the request was unsuccessful
            response.raise_for_status()

            # Save the image
            with open(os.path.join(save_dir, file_name), 'wb') as f:
                f.write(response.content)

            print(
                f"({index}/{len(urls)}) Downloaded {file_name}")

        except requests.RequestException as e:
            print(f"Failed to download {url}: {e} ({index}/{len(urls)})")

    print("Done downloading images.")

In [3]:
download_images('../data/reddit_wholesomegreentext_posts/image_urls.txt', '../data/images/')

Failed to download https://preview.redd.it/wbjgop36j87a1.png?auto=webp&amp;s=20eb7cd28d81785aa26269b3c2998e1eadda1c1b: 403 Client Error: Forbidden for url: https://preview.redd.it/wbjgop36j87a1.png?auto=webp&amp;s=20eb7cd28d81785aa26269b3c2998e1eadda1c1b (1/4607)
Failed to download https://preview.redd.it/cr5ubrq4g2x41.jpg?auto=webp&amp;s=34cbef2c453f87341f7fa54a44efd9d05253ba71: 403 Client Error: Forbidden for url: https://preview.redd.it/cr5ubrq4g2x41.jpg?auto=webp&amp;s=34cbef2c453f87341f7fa54a44efd9d05253ba71 (2/4607)
Failed to download https://preview.redd.it/268z2rv75cj51.jpg?auto=webp&amp;s=9830fca5691bdbfa5f728ea3b3f29acc4850abc3: 403 Client Error: Forbidden for url: https://preview.redd.it/268z2rv75cj51.jpg?auto=webp&amp;s=9830fca5691bdbfa5f728ea3b3f29acc4850abc3 (3/4607)
Failed to download https://preview.redd.it/fuwr5eoljww11.png?auto=webp&amp;s=63f7387166b2fa66f7cc105a21fbb875937e27f1: 403 Client Error: Forbidden for url: https://preview.redd.it/fuwr5eoljww11.png?auto=webp&