In [43]:
import os
import os.path
import pathlib
from pprint import pprint
from bs4 import BeautifulSoup

In [48]:
def get_paths(dir: str, depth: int = 1, exclude=[".git", ".gitignore", "node_modules"]):
    """Traverses the the directory `dir`, and returns a list of paths relative to `dir`. 

    Positional args:
        - dir <str>: The directory to traverse. If `dir` doesn't exist in working directory, 
            the programs raises a `FileExistsError`.
        - depth <int>: The depth of the traversal.
    Optional args: 
        - exclude <list>: A list of files and/or directories to exclude from the returned list. By
        default, ignores `.git/`, `.gitignore`, and `node_modules/`.

    Returns a list of paths to files and directories.

    TODO: Update `exclude` parameter so that the list items can be regex (or globs?).
    TODO: Add `excludedefault` to optinally exclude the files that are excluded by default (mouthful, lol)
    """

    if not os.path.exists(dir):
        raise FileExistsError("'%s does not exist" % dir)

    # If we've reached the depth limit of the traversal, return; don't traverse anymore
    if depth == 0:
        return []

    dir = pathlib.Path(dir)
    paths = []  # TODO: Should this be named `filenames` or `paths` instead?
    for file in dir.iterdir():
        # Ignore that paths listed in `exclude`.
        if file.name in exclude:
            continue
        
        if file.is_file():
            paths.append(file.name)
        else:
            subdir_paths = get_paths(file, depth - 1)
            # If the subdirectory is empty or we reached the depth limit
            if len(subdir_paths) == 0:
                paths += [file.name + '/']

            # Concatenate the directory path to the returned list of subdirectories
            else:
                for sub_index in range(0, len(subdir_paths)):
                    subdir_paths[sub_index] = file.name + \
                        '/' + subdir_paths[sub_index]
                paths += subdir_paths

    return paths


root = pathlib.Path(".")
depth = 2
paths = get_paths(root, depth=depth)
pprint(paths)

['index.html',
 'styles.css',
 'blog/index.html',
 'blog/styles.css',
 'README.md',
 'manager.ipynb',
 'manager.py']


In [78]:
def get_links(path: str):
    """
    Returns an array of links in a webpage.

    Positional args:
        page <str>: The path to an HTML page that contains anchors.
    """
    with open(path) as f:
        page = BeautifulSoup(f, "html.parser") 
        anchor_tag = "a"
        anchors = page.find_all(anchor_tag)
        links = [anchor.get("href") for anchor in anchors]

        return links
    
links = []
for path in paths:
    if "html" in path:
        links += [(path, get_links(path))]

links

[('index.html', ['blog/index.html']),
 ('blog/index.html', ['../index.html', '../f-index.html'])]

In [107]:
def get_broken_links(path: str):
    """
    Returns a list that contains all the broken links in `page`.

    args:
        path <str>: The path to an html page.
    TODO: Should the returned list also contain broken 'jump links'?
    TODO: (1) make this a logging feature.
    """
    links = get_links(path)
    broken_links = []
    for link in links:
        dirname = os.path.dirname(path)
        basename = os.path.basename(path)
        parent = link.count("..")
        truncated_link = link.split('/')[parent:]
        # print(path, ' <', dirname, '> ', link,
        #       ' ', truncated_link, parent)  # (1)

        if not parent:
            if dirname != "": 
                resolved_path = '/'.join([dirname, link])
            else: 
                resolved_path = link
            if not os.path.exists(resolved_path):
                broken_links += [link]
        else:
            # split the directory into its components
            dirname = dirname.split('/')
            # Create a search path
            try:
                # Get the parent directory of the referrant
                search_path = dirname[:-(parent + 1)]
            except IndexError:
                # If the search goes out of bounds, the search directory is the root of the project
                search_path = []

            resource_path = '/'.join(search_path + truncated_link)
            if not os.path.exists(resource_path):
                broken_links += [link]
    return broken_links


print(get_broken_links("blog/index.html"))

blog/index.html   blog   ../index.html   ['index.html'] 1
blog/index.html   blog   ../f-index.html   ['f-index.html'] 1
['../f-index.html']


In [80]:
def scan_directory(path: str):
    """"""
    paths = get_paths(path, depth=2)
    for path in paths:
        basename = os.path.basename(path) # get the basename 
        name, ext = os.path.splitext(basename) # get the filename and extension

        # This kind of filtering should be encapsulated in `get_paths()`
        if ext == ".html":
            print(
                f"{path}: {get_broken_links(path)}"
            )

scan_directory("..")

FileNotFoundError: [Errno 2] No such file or directory: 'snippets/layout.html'