In [1]:
from urllib.parse import urldefrag, urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import logging

# Create a logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a Formatter for the log records
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create the FileHandler only once.
# logging.getLogger(__name__) returns the same logger object every time this
# cell is executed, so unconditionally calling addHandler() would attach one
# more FileHandler per re-run and every record would be written to app.log
# that many times. Reuse the handler left by a previous run instead.
file_handler = next(
    (h for h in logger.handlers if isinstance(h, logging.FileHandler)),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('app.log')
    # Add the FileHandler to the logger
    logger.addHandler(file_handler)

# (Re)apply level and formatter so edits to this cell take effect on re-run
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)


## Global variable instantiation

In [3]:
# Seed page of the crawl; it is wrapped into url_list and handed to
# recursive_url_fetcher further down.
url = "https://www.cncbinternational.com/home/en/index.jsp"

In [4]:
# Starting set of URLs passed to recursive_url_fetcher.
url_list = [url]
# Pages the crawler has already requested; recursive_url_fetcher appends to
# this list and consults it to avoid fetching the same page twice.
traversed_url_list = []
# PDF links collected by recursive_url_fetcher (the output of the crawl).
all_pdf_links = []

def is_valid_url(url, timeout=5):
    """Report whether ``url`` answers a HEAD request with a non-error status.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the HEAD request before giving up.

    Returns:
        True when the request (following redirects) finishes with a status
        code below 400; False for status codes of 400 and above or when
        ``requests`` raises any ``RequestException``. The outcome is also
        written to the module logger at INFO level.
    """
    try:
        head_response = requests.head(url, timeout=timeout, allow_redirects=True)
        reachable = head_response.status_code < 400
    except requests.exceptions.RequestException:
        # Network failures, bad schemes, timeouts, ... all count as unreachable.
        reachable = False

    if reachable:
        logger.info(f"URL {url} is valid")
    else:
        logger.info(f"URL {url} could not be reached")
    return reachable

def _is_pdf_url(link):
    """Return True when the path component of ``link`` ends in ``.pdf``."""
    try:
        path = urlparse(link).path
    except ValueError:
        # Unparseable URL: fall back to a suffix check on the raw string.
        path = link
    return path.lower().endswith('.pdf')


def _fetch_page_links(page_url, timeout):
    """Download ``page_url`` and return the absolute targets of its anchors.

    Request failures and non-200 responses are logged and yield an empty list.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
    except requests.exceptions.HTTPError as errh:
        logger.info(f"Http Error: {errh}")
        return []
    except requests.exceptions.ConnectionError as errc:
        logger.info(f"Error Connecting: {errc}")
        return []
    except requests.exceptions.Timeout as errt:
        logger.info(f"Timeout Error: {errt}")
        return []
    except requests.exceptions.RequestException as err:
        logger.info(f"Something went wrong: {err}")
        return []

    if response.status_code != 200:
        logger.info(f"Skipping {page_url}: HTTP {response.status_code}")
        return []

    # get html and convert to soup
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    # get all anchors that carry an href
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        try:
            # Relative hrefs must be resolved against the page they were found
            # on (response.url is that page after redirects). The "#fragment"
            # is dropped because it addresses the same document.
            links.append(urldefrag(urljoin(response.url, href)).url)
        except ValueError:
            logger.info(f"Skipping malformed link {href!r} on {page_url}")
    return links


def recursive_url_fetcher(urls, timeout=5, allowed_host="www.cncbinternational.com"):
    """Crawl breadth-first from ``urls`` and collect every PDF link found.

    The traversal is iterative (the name is kept for existing callers), so
    the crawl depth is not limited by Python's recursion limit.

    Side effects: every fetched page is appended to the module-level
    ``traversed_url_list`` and every newly seen PDF link to the module-level
    ``all_pdf_links``; URLs already present in either list are skipped.

    Args:
        urls: Iterable of absolute URLs to start from. ``None`` or an empty
            iterable makes the call a no-op.
        timeout: Seconds to wait for each page download and each link probe.
        allowed_host: Only links whose host name equals this value
            (case-insensitive) are followed or recorded.

    Returns:
        None.
    """
    # Local sets give O(1) membership tests; the module-level lists stay the
    # public record and are kept in sync with them.
    visited = set(traversed_url_list)
    known_pdfs = set(all_pdf_links)
    # Links already examined during this call, so a link that appears on many
    # pages is probed with is_valid_url only once.
    probed = set()
    allowed_host = allowed_host.lower()

    frontier = list(urls or [])
    while frontier:
        discovered = []
        for curr_url in frontier:
            # prune urls that have been handled before
            if curr_url in visited or curr_url in known_pdfs:
                continue

            # PDFs are results, not pages to parse: record and move on
            if _is_pdf_url(curr_url):
                known_pdfs.add(curr_url)
                all_pdf_links.append(curr_url)
                continue

            visited.add(curr_url)
            traversed_url_list.append(curr_url)

            for full_link in _fetch_page_links(curr_url, timeout):
                if full_link in probed or full_link in visited or full_link in known_pdfs:
                    continue
                probed.add(full_link)
                # Compare the parsed host instead of searching for a substring,
                # otherwise off-site links that merely mention the domain
                # (e.g. share links with it in the query string) are followed.
                if urlparse(full_link).hostname != allowed_host:
                    continue
                if is_valid_url(full_link, timeout=timeout):
                    discovered.append(full_link)

        frontier = discovered


# Run the crawl from the seed list. The call returns nothing; results
# accumulate in traversed_url_list and all_pdf_links.
recursive_url_fetcher(url_list)

KeyboardInterrupt: 