In [8]:
import os
import os.path
import pathlib
import sys
from pprint import pprint
from bs4 import BeautifulSoup

In [32]:
def get_paths(dir: str, depth: int = 1, exclude=[".git", ".gitignore", "node_modules"]):
    """Traverses the contents of the directory `dir`, and returns the paths of each file system 
    object relative to `dir`. 

    Positional args:
        - dir <str>: The directory to traverse. If `dir` doesn't exist in working directory, 
            the programs raises a `FileExistsError`.
        - depth <int>: The depth of the traversal.
        - exclude <list>: A list of files and directories to exclude from the returned list. By
        default, ignores `.git/`, `.gitignore`, and `node_modules/`.

    Returns a list of files and directories.

    FIXME: Update the name of the function to `get_paths` so it reflects its actual functionality
    TODO: Update `exclude` parameter so that the list items can be regex (or globs?).
    TODO: Add `excludedefault` to optinally exclude the files that are excluded by default (mouthful, lol)
    """

    if not os.path.exists(dir):
        raise FileExistsError("'%s does not exist" % dir)

    # If we've reached the depth limit of the traversal, return; don't traverse anymore
    if depth == 0:
        return []

    dir = pathlib.Path(dir)
    paths = []  # TODO: Should this be named `filenames` or `paths` instead?
    for file in dir.iterdir():
        # Ignore that paths listed in `exclude`.
        if file.name in exclude:
            continue
        
        if file.is_file():
            paths.append(file.name)
        else:
            subdir_paths = get_paths(file, depth - 1)
            # If the subdirectory is empty or we reached the depth limit
            if len(subdir_paths) == 0:
                paths += [file.name + '/']

            # Concatenate the directory path to the returned list of subdirectories
            else:
                for sub_index in range(0, len(subdir_paths)):
                    subdir_paths[sub_index] = file.name + \
                        '/' + subdir_paths[sub_index]
                paths += subdir_paths

    return paths


root = pathlib.Path("..")
depth = 1
paths = get_paths(root, depth=depth)
pprint(paths)

['reading/',
 'snippets/',
 'index.html',
 '.DS_Store',
 'learn-with-me/',
 'about.html',
 'projects/',
 '_/',
 'blog/',
 'favicon.png',
 'package.json',
 'profile-picture.png',
 'style.css',
 '.prettierrc']


In [3]:
def get_links(page: str):
    """
    Returns all the links from a webpage.

    args:
        page <str>: The name of the HTML page that contains anchors.
    """
    with open(page) as f:
        page = BeautifulSoup(f, "html.parser") 
        anchor_tag = "a"
        anchors = page.find_all(anchor_tag)
        links = [anchor.get("href") for anchor in anchors]

        return links
    
links = []
for file in files:
    if "html" in file:
        links += [(file, get_links(file))]

links

[('index.html', ['blog/index.html'])]

In [4]:
def get_broken_links(page):
    """
    Returns a list that contains all the broken links.
    """
    links = get_links(page)
    broken_links = []
    for link in links:
        if not os.path.exists(link):
            broken_links += [link]
    
    return broken_links
    
page = "blog/index.html"
broken_links = get_broken_links(page)
broken_links


['../f-index.html']