In [None]:
"""

WebScraper: A Python Utility for Downloading Financial-Related Images
This tool enables automated downloading of images related to financial terms using Bing's search capabilities. It is structured to handle multiple search terms, create organized directories, and provide easy access to downloaded files.

Modules Utilized:
os: Manages local directory creation for organizing downloaded files.
requests: Sends HTTP GET requests and downloads files directly from URLs.
bing_image_downloader: Facilitates downloading images based on specified search terms.
Class Structure and Functionality:

fetch_bing_images(self):

Defines a list of search terms related to financial documents (e.g., "bank statement", "balance sheet").
Creates a directory named "Financial Related Data" if it does not already exist.
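Uses bing_image_downloader to search Bing and download images for each term; the number of images per term is set by the `limit` argument (the code below uses `limit=1`). Each term's images are saved in a folder named after the term under the main directory.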
download_file(self, url, path):

Attempts to download a file from a given URL using HTTP GET requests.
Saves the file to a specified path if the request is successful.
Handles errors and logs any download issues, ensuring smooth operation.

"""

In [1]:
# Final code that downloads images by using the bing_image_downloader library

import os
import requests
from bing_image_downloader import downloader

class WebScraper:
    """Download sample financial-document images from Bing and single files over HTTP."""

    # Search terms used when the caller does not pass its own list.
    DEFAULT_SEARCH_TERMS = (
        "bank statement",
        "profit and loss statement",
        "balance sheet",
        "financial statement",
        "invoice document",
        "salary slip",
        "passbooks",
        "Affidavit Document",
    )

    def fetch_bing_images(self, search_terms=None, output_dir="Financial Related Data",
                          limit=1, adult_filter_off=True, timeout=60):
        """Download images from Bing for each search term.

        Args:
            search_terms: Iterable of query strings (a single string is treated as
                one query). Defaults to DEFAULT_SEARCH_TERMS.
            output_dir: Directory that receives the downloads; created if missing.
            limit: Maximum number of images requested per term.
            adult_filter_off: Forwarded to ``downloader.download``. True (the
                default, kept for backward compatibility) switches Bing's adult
                content filter OFF; pass False to keep the filter on.
            timeout: Per-download timeout in seconds, forwarded to the downloader.
        """
        if search_terms is None:
            terms = list(self.DEFAULT_SEARCH_TERMS)
        elif isinstance(search_terms, str):
            # A bare string would otherwise be iterated character by character.
            terms = [search_terms]
        else:
            terms = list(search_terms)

        os.makedirs(output_dir, exist_ok=True)

        for term in terms:
            downloader.download(term,
                                limit=limit,
                                output_dir=output_dir,
                                adult_filter_off=adult_filter_off,
                                force_replace=False,
                                timeout=timeout)
            print(f"Downloaded images for search term: {term}")

    def download_file(self, url, path, timeout=60):
        """Download ``url`` to ``path``; best effort, never raises.

        Args:
            url: Address to fetch with an HTTP GET.
            path: Destination file path (opened in binary write mode).
            timeout: Seconds to wait for the server before giving up. Without a
                timeout a stalled server would block the call indefinitely.

        Returns:
            True when the file was written, False on a non-200 status or any error.
        """
        try:
            # The context manager releases the connection even on early return.
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Failed to download {url}")
                    return False
                with open(path, 'wb') as file:
                    # Write chunk by chunk so stream=True actually avoids
                    # buffering the whole body in memory.
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
            print(f"Downloaded: {path}")
            return True
        except Exception as e:
            # Deliberately broad: a failed download is reported, not fatal.
            print(f"Error in downloading {url}: {e}")
            return False

# Entry point: runs only when this code is executed directly, not when imported.
if __name__ == "__main__":
    # Build a scraper and download the Bing images for its search terms.
    scraper = WebScraper()
    scraper.fetch_bing_images()



[%] Downloading Images to C:\Users\Aman\Financial Related Data\bank statement


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2021/03/RBS-Bank-Statement-TemplateLab.com_.jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
Downloaded images for search term: bank statement
[%] Downloading Images to C:\Users\Aman\Financial Related Data\profit and loss statement


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2015/11/Profit-and-Loss-33.jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
Downloaded images for search term: profit and loss statement
[%] Downloading Images to C:\Users\Aman\Financial Related Data\balance sheet


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2016/01/Balance-Sheet-Template-02.jpg
[%] File Downloaded !



In [2]:
# Importing the os module to interact with the file system
import os

# Importing the requests library to handle HTTP requests for file downloading
import requests

# Importing the bing_image_downloader library for downloading images from Bing
from bing_image_downloader import downloader

# Define a class WebScraper to encapsulate the web scraping functionality
class WebScraper:
    """Download sample financial-document images from Bing and single files over HTTP."""

    # Search terms used when the caller does not pass its own list
    DEFAULT_SEARCH_TERMS = (
        "bank statement",                # Search term for bank statement images
        "profit and loss statement",     # Search term for profit and loss statement images
        "balance sheet",                 # Search term for balance sheet images
        "financial statement",           # Search term for financial statement images
        "invoice document",              # Search term for invoice document images
        "salary slip",                   # Search term for salary slip images
        "passbooks",                     # Search term for passbook images
        "Affidavit Document",            # Search term for affidavit document images
    )

    # Method to fetch images using the Bing Image Downloader
    def fetch_bing_images(self, search_terms=None, output_dir="Financial Related Data",
                          limit=1, adult_filter_off=True, timeout=60):
        """Download images from Bing for each search term.

        Args:
            search_terms: Iterable of query strings (a single string is treated as
                one query). Defaults to DEFAULT_SEARCH_TERMS.
            output_dir: Directory that receives the downloads; created if missing.
            limit: Maximum number of images requested per term.
            adult_filter_off: Forwarded to ``downloader.download``. True (the
                default, kept for backward compatibility) switches Bing's adult
                content filter OFF; pass False to keep the filter on.
            timeout: Per-download timeout in seconds, forwarded to the downloader.
        """
        # Normalise the search terms into a list
        if search_terms is None:
            terms = list(self.DEFAULT_SEARCH_TERMS)
        elif isinstance(search_terms, str):
            # A bare string would otherwise be iterated character by character
            terms = [search_terms]
        else:
            terms = list(search_terms)

        # Create the output directory if it does not already exist
        os.makedirs(output_dir, exist_ok=True)

        # Iterate through each search term and download the images
        for term in terms:
            downloader.download(
                term,                               # Search term to use
                limit=limit,                        # Number of images to download for each term
                output_dir=output_dir,              # Directory where images will be saved
                adult_filter_off=adult_filter_off,  # True disables the adult-content filter
                force_replace=False,                # Do not replace existing files
                timeout=timeout                     # Maximum time (in seconds) to wait for the download
            )
            # Print a message indicating the completion of the download for the term
            print(f"Downloaded images for search term: {term}")

    # Method to download files from a given URL and save them to a specified path
    def download_file(self, url, path, timeout=60):
        """Download ``url`` to ``path``; best effort, never raises.

        Args:
            url: Address to fetch with an HTTP GET.
            path: Destination file path (opened in binary write mode).
            timeout: Seconds to wait for the server before giving up. Without a
                timeout a stalled server would block the call indefinitely.

        Returns:
            True when the file was written, False on a non-200 status or any error.
        """
        try:
            # Send a streaming GET request; the context manager releases the
            # connection even when we return early
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Anything other than 200 is reported as a failed download
                if response.status_code != 200:
                    print(f"Failed to download {url}")
                    return False

                # Write chunk by chunk so stream=True actually avoids buffering
                # the whole body in memory
                with open(path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)

            # Print a success message
            print(f"Downloaded: {path}")
            return True
        except Exception as e:
            # Deliberately broad: a failed download is reported, not fatal
            print(f"Error in downloading {url}: {e}")
            return False

# Main execution block to run the WebScraper
if __name__ == "__main__":
    # Runs only when this code is executed directly, not when it is imported.
    # Create the scraper instance that performs the downloads
    scraper = WebScraper()
    
    # Download the Bing images for the scraper's search terms
    scraper.fetch_bing_images()


[%] Downloading Images to C:\Users\Aman\Financial Related Data\bank statement


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2021/03/RBS-Bank-Statement-TemplateLab.com_.jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
Downloaded images for search term: bank statement
[%] Downloading Images to C:\Users\Aman\Financial Related Data\profit and loss statement


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2020/06/Quarterly-Profit-Loss-Statement-Template-TemplateLab-scaled.jpg
[%] File Downloaded !



[%] Done. Downloaded 1 images.
Downloaded images for search term: profit and loss statement
[%] Downloading Images to C:\Users\Aman\Financial Related Data\balance sheet


[!!]Indexing page: 1

[%] Indexed 1 Images on Page 1.


[%] Downloading Image #1 from https://templatelab.com/wp-content/uploads/2016/01/Balance-Shee

In [3]:
# Import necessary libraries for file management and HTTP requests
import os
import requests

# Import the Cloudinary library for interacting with Cloudinary's API
import cloudinary
import cloudinary.api

# Configure Cloudinary credentials for authentication
# Credentials are read from the environment so that no secret lives in the
# notebook. Set CLOUDINARY_CLOUD_NAME, CLOUDINARY_API_KEY and
# CLOUDINARY_API_SECRET before running this cell; a missing variable raises
# KeyError immediately rather than failing later with an opaque auth error.
# NOTE(review): any key/secret that was ever committed in this notebook should
# be treated as compromised and rotated in the Cloudinary console.
cloudinary.config(
    cloud_name=os.environ["CLOUDINARY_CLOUD_NAME"],
    api_key=os.environ["CLOUDINARY_API_KEY"],
    api_secret=os.environ["CLOUDINARY_API_SECRET"]
)

def download_image(url, local_path, file_name, timeout=60):
    """Download ``url`` into ``local_path/file_name``; best effort, never raises.

    Args:
        url: Address of the image to fetch with an HTTP GET.
        local_path: Destination directory, created if missing. An empty string
            means the current working directory.
        file_name: Name of the file to create inside ``local_path``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the call indefinitely.

    Returns:
        True when the file was written, False on a non-200 status or any error.
    """
    try:
        # exist_ok avoids the check-then-create race; os.makedirs("") would
        # raise, so an empty path is simply left as "current directory".
        if local_path:
            os.makedirs(local_path, exist_ok=True)

        # Construct the full file path where the image will be saved
        file_path = os.path.join(local_path, file_name)

        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                # Log an error if the response status code is not successful
                print(f"Failed to download {url}: {response.status_code}")
                return False

            # Write the image content to the local file in chunks
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    if chunk:
                        f.write(chunk)

        print(f"Downloaded: {file_path}")  # Log the success
        return True
    except Exception as e:
        # Deliberately broad: one failed image must not abort a bulk download.
        print(f"Error downloading {url}: {e}")
        return False

# Function to fetch images and subfolder structure from Cloudinary
def fetch_images_and_subfolders(prefix="", local_folder=""):
    """Download every uploaded Cloudinary resource under ``prefix``.

    The Cloudinary folder structure below ``prefix`` is mirrored under
    ``local_folder``. Files are named ``image_<n>.<format>``, where ``n`` counts
    the resources seen so far in that folder, in the order the API returns them.

    Args:
        prefix: Public-ID prefix used to filter resources ("" for everything).
        local_folder: Local directory under which the folders are recreated.

    Errors are printed rather than raised (best effort).
    """
    try:
        # Dictionary to track how many images are downloaded per folder;
        # shared across pages so numbering continues from one page to the next.
        folder_image_count = {}
        next_cursor = None

        while True:
            query = {
                'type': 'upload',    # Resource type set to 'upload'
                'prefix': prefix,    # Folder or file prefix to filter resources
                'max_results': 500,  # Max number of results to fetch in one call
            }
            if next_cursor:
                query['next_cursor'] = next_cursor

            page = cloudinary.api.resources(**query)

            for resource in page['resources']:
                folder_path = _relative_folder(resource['public_id'], prefix)
                local_subfolder = os.path.join(local_folder, folder_path)

                # Increment the image count for unique naming in the folder
                image_count = folder_image_count.get(local_subfolder, 0) + 1
                folder_image_count[local_subfolder] = image_count

                file_name = f"image_{image_count}.{resource['format']}"
                download_image(resource['secure_url'], local_subfolder, file_name)

            # The API returns a cursor only while more results remain; without
            # following it, everything past the first 500 resources is skipped.
            next_cursor = page.get('next_cursor')
            if not next_cursor:
                break
    except Exception as e:
        # Log any exceptions encountered during the fetching process
        print(f"Error fetching resources for prefix '{prefix}': {e}")


def _relative_folder(public_id, prefix):
    """Return the folder part of ``public_id`` with a leading ``prefix`` removed."""
    folder = os.path.dirname(public_id)
    # Strip the prefix only at the start; str.replace would also delete any
    # later occurrence of the same text inside nested folder names.
    if prefix and folder.startswith(prefix):
        folder = folder[len(prefix):]
    return folder.strip("/")

# Function to fetch and download all resources from the root folder
def fetch_and_download_all():
    """Mirror the whole Cloudinary account into the current working directory."""
    # An empty prefix matches every resource; "." targets the current directory.
    root_prefix = ""
    destination = "."
    fetch_images_and_subfolders(prefix=root_prefix, local_folder=destination)

# Start the process of fetching and downloading images
# Guarded like the other entry points in this notebook: the bulk download runs
# when the cell or script is executed directly, but not when the code is imported.
if __name__ == "__main__":
    fetch_and_download_all()


Downloaded: .\Web Scraper/Balance Sheets\image_1.png
Downloaded: .\Web Scraper/Balance Sheets\image_2.jpg


KeyboardInterrupt: 