## Callysto Course Link Checker 

**Description**: This is a notebook that iterates through directory files and runs a validation check on the links contained in Jupyter Notebooks. This notebook was written for Callysto internal use.

**Usage**: Run this notebook in the parent directory containing the notebooks or in the directory itself.

**Notes**: This notebook takes time to run; you will know it is done when it prints its termination statement. It prints only error messages. It can only handle conventional URLs starting with http://, https://, or www.

Last Edited: June 16, 2020

Author: LNC

Contact: lisa.cao@cybera.ca

In [None]:
# run only once if needed
# !pip3 install urllib3

In [None]:
# import libraries/modules - all are default except urllib3
import os
import json
import re
import urllib3

In [None]:
## extract every url from a piece of text (regex from geeksforgeeks)
def url_parse(string):
    """Return all urls found in ``string``, in order of appearance.

    Links are recognised when they begin with ``http://``, ``https://``,
    ``www.`` or a bare domain followed by a slash. Punctuation trailing the
    link (full stop, comma, closing bracket, quotes, ...) is not included.

    Args:
        string: the text to scan.

    Returns:
        A list of url strings; empty when the text contains no link.
    """
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # group 1 wraps the whole url; the inner groups only exist to balance
    # parentheses inside a link, so they are ignored here
    return [match.group(1) for match in re.finditer(pattern, string)]

In [None]:
## search through all directories, parse notebook cells and test every url
def _cell_text(cell):
    """Return the complete source text of one notebook cell as a string."""
    # 'source' may be a single string or a list of lines; "".join() yields
    # the full text in both cases and "" for an empty or missing source
    return "".join(cell.get('source') or '')


def _check_url(http, file, cell_number, url):
    """Request ``url`` once and print a report if it is broken."""
    try:
        req = http.request('GET', url, timeout=5.0, retries=False)
    except Exception as e:  # timeouts, connection errors and bad url formats
        print("BROKEN URL in", file, ": Cell", cell_number, url, "\n    reason:", e, "\n")
        return
    # 429 means "too many requests": the link exists, we are just rate limited
    if req.status >= 400 and req.status != 429:
        print("BROKEN URL in", file, ": Cell", cell_number, url, "\n    HTTP Status:", req.status, "\n")


def url_check():
    """Check every url in every ``.ipynb`` file below the current directory.

    Walks the tree rooted at ``"."``, loads each notebook as JSON, extracts
    the urls from the full source of every cell with ``url_parse`` and sends
    one GET request per url (5 second timeout, no retries). Only problems are
    printed: a url is reported when the request raises or when the HTTP
    status is 400 or above (429 excepted). Notebooks that cannot be read or
    parsed are reported and skipped so one bad file does not stop the scan.
    ``.. CHECK COMPLETE`` is printed at the end.

    Returns:
        None. All results are written to standard output.
    """
    http = None  # connection pool, created on the first url and then reused
    for root, _dirs, files in os.walk("."):
        for filename in files:
            if not filename.endswith('.ipynb'):  # select notebooks
                continue
            file = os.path.join(root, filename)
            try:
                # the context manager closes the file even if parsing fails
                with open(file, encoding='utf-8') as handle:
                    notebook = json.load(handle)
            except (OSError, ValueError) as e:  # unreadable file or invalid JSON
                print("COULD NOT READ", file, "\n    reason:", e, "\n")
                continue
            # cell numbers in the report start at 1
            for cell_number, cell in enumerate(notebook.get('cells', []), start=1):
                for url in url_parse(_cell_text(cell)):
                    if http is None:
                        http = urllib3.PoolManager()
                    _check_url(http, file, cell_number, url)
    print(".. CHECK COMPLETE")

In [None]:
# run the link check starting from the current working directory;
# ".. CHECK COMPLETE" is printed once every notebook has been processed
url_check()