In [1]:
import os.path
import pathlib
from urllib.parse import unquote, urlsplit

from bs4 import BeautifulSoup

In [2]:
def traverse_dir(dir: str, depth: int = 1):
    """List the contents of the directory `dir`, descending at most `depth` levels.

    Positional args:
        - dir <str>: The directory to traverse. A `pathlib.Path` is accepted as
            well (the recursive calls pass one).
        - depth <int>: How many directory levels to descend. `1` lists only `dir`
            itself and `0` lists nothing. A negative value never reaches the
            limit, so the whole tree is traversed.

    Returns a list of names relative to `dir`, joined with '/'. Files appear under
    their name; a directory that is empty, or whose contents lie beyond the depth
    limit, appears as a single entry with a trailing '/'. The order is whatever
    `Path.iterdir()` yields.

    Raises `FileExistsError` if `dir` does not exist (only checked when `depth`
    is not 0).
    """
    # Depth limit reached: report nothing, so the caller lists this directory as "name/"
    if depth == 0:
        return []

    if not os.path.exists(dir):
        raise FileExistsError("'%s' does not exist" % dir)

    filenames = []  # TODO: Should this be named `filenames` or `paths` instead?
    for entry in pathlib.Path(dir).iterdir():
        if not entry.is_dir():
            # Regular files, and anything else that cannot be descended into
            # (e.g. a dangling symlink), are listed as leaves.
            filenames.append(entry.name)
            continue

        nested = traverse_dir(entry, depth - 1)
        if nested:
            filenames.extend(entry.name + '/' + name for name in nested)
        else:
            filenames.append(entry.name + '/')

    return filenames

# List only the top level of the current working directory. With depth 1 the
# sub-directories are not descended into, so each shows up as a bare "name/" entry.
root = pathlib.Path(".")
depth = 1
files = traverse_dir(root, depth=depth)
print(files)

['index.html', 'styles.css', 'blog/', 'manager.ipynb', 'manager.py']


In [3]:
def get_links(page: str):
    """
    Returns the `href` of every anchor in an HTML page.

    args:
        page <str>: The path of the HTML file that contains anchors.

    Returns a list of href strings, in the order `find_all` yields the anchors.
    Anchors without an `href` attribute (e.g. `<a name="top">`) are skipped, so
    the list never contains `None`.
    """
    # Binary mode: hand BeautifulSoup the raw bytes so it works out the document's
    # encoding itself instead of depending on the platform's default text encoding.
    with open(page, "rb") as f:
        soup = BeautifulSoup(f, "html.parser")

    # href=True restricts the search to anchors that actually carry an href
    return [anchor["href"] for anchor in soup.find_all("a", href=True)]
    
# Collect a (page, hrefs) pair for every HTML page in the directory listing.
links = []
for file in files:
    # Match on the extension: a substring test would also pick up entries such
    # as "html/" or "html-notes.txt" and then try to open them as pages.
    if file.endswith(".html"):
        links.append((file, get_links(file)))

links

[('index.html', ['blog/index.html'])]

In [4]:
def get_broken_links(page):
    """
    Returns a list that contains all the broken local links of `page`.

    args:
        page <str>: The path of the HTML file whose links are checked.

    A link is broken when the file or directory it points at does not exist.
    Relative links are resolved against the directory that contains `page`, the
    way a browser resolves them, not against the current working directory.
    Any query string or fragment is ignored for the existence check.

    Links that cannot be verified on the filesystem are never reported: those
    with a scheme or host (`https://...`, `mailto:...`, `//cdn/...`) and those
    that only carry a fragment or query (`#top`), which refer to `page` itself.

    Returns the broken links exactly as they are written in the page.
    """
    page_dir = os.path.dirname(page)
    broken_links = []
    for link in get_links(page):
        if not link:
            # Missing or empty href: nothing to check
            continue

        parts = urlsplit(link)
        if parts.scheme or parts.netloc:
            # External resource; its existence cannot be checked on disk
            continue
        if not parts.path:
            # "#fragment" / "?query" only: points back at the page itself
            continue

        # NOTE(review): a root-relative href ("/blog/x.html") makes os.path.join
        # discard page_dir, so it is checked as an absolute filesystem path --
        # confirm whether a site-root setting is wanted here.
        target = os.path.join(page_dir, unquote(parts.path))
        if not os.path.exists(target):
            broken_links.append(link)

    return broken_links
    
# Spot-check a single page: show the links that get_broken_links reports for it.
page = "blog/index.html"
broken_links = get_broken_links(page)
broken_links


['../f-index.html']