# Setup

In [1]:
from urllib.parse import urljoin, urlparse
from IPython.display import HTML
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Helper functions

In [2]:
def get_links_to_internal_urls(path, netloc):
    url = urljoin(f"https://{netloc}",path)
    response = requests.get(url)
    if response.status_code != 200:
        message = f'HTTP {response.status_code}: {url} '
        print(message)
        raise ValueError(message)
    soup = BeautifulSoup(
        response.content,
        'html.parser'
    )
    links = [urlparse(l.get("href")) for l in soup.find_all('a') if l is not None and l.get("href") is not None]
    return [
        l.path for l in links
        if l.netloc=='' or l.netloc == netloc
    ]

def show_urls(url_list):
    return HTML(url_list.sort_index().to_html(render_links=True, escape=False))

# Goal

We want to quickly get all broken URLs from a website. The steps will be
1. Submit the base URL
2. Recursively follow the links to get an exhaustive list of all pages
3. If a webpage returns a non 200 result, add it to the list of broken URLs

In [3]:
starting_url = urlparse('https://collapsedwave.com/')
netloc = starting_url.netloc
results = pd.DataFrame(
    data=[[False, False]],
    columns=["crawled", "broken"], index=[starting_url.path]
)
results

Unnamed: 0,crawled,broken
/,False,False


In [4]:
with tqdm() as pbar:
    while not results[results["crawled"]==False].empty:
        current_url = results[results["crawled"]==False].index[0]
        try:
            new_links = get_links_to_internal_urls(path=current_url, netloc=netloc)
            new_urls = set(new_links).difference(set(results.index))
            # print(urljoin(f"https://{netloc}", current_url))
            # print(new_urls)
            for new_url in new_urls:
                results.loc[new_url] = [False, False]
            results.loc[current_url] = [True, False]
        except:
            results.loc[current_url] = [True, True]
        pbar.update(1)
        seen = results[results['crawled']==True].shape[0]
        failed = results[results['broken']==True].shape[0]
        remaining = results[results['crawled']==False].shape[0]
        pbar.total = seen + failed + remaining
        pbar.set_description(f"{seen} seen | {failed} failed | {remaining} remaining")

# Adds the complete url after building the fully explored graph
results["url"] = [urljoin(f"https://{netloc}",l) for l in list(results.index)]

0it [00:00, ?it/s]

HTTP 404: https://collapsedwave.com/Diffeomorphism 




HTTP 404: https://collapsedwave.com/Riesz-Representation-Theorem 
HTTP 404: https://collapsedwave.com/Implicit-Function-Theorem 
HTTP 404: https://collapsedwave.com/acceleration 
HTTP 404: https://collapsedwave.com/mass 
HTTP 404: https://collapsedwave.com/force 
HTTP 404: https://collapsedwave.com/wave-function 
HTTP 404: https://collapsedwave.com/quantum-state-vector 
HTTP 404: https://collapsedwave.com/unit-vector 
HTTP 404: https://collapsedwave.com/quantum-system-state 
HTTP 404: https://collapsedwave.com/Plancks-reduced-constant 
HTTP 404: https://collapsedwave.com/cauchy-sequence 
HTTP 404: https://collapsedwave.com/point-separation 
HTTP 404: https://collapsedwave.com/triangle-inequality 
HTTP 404: https://collapsedwave.com/vector 
HTTP 404: https://collapsedwave.com/field 
HTTP 404: https://collapsedwave.com/scalar 
HTTP 404: https://collapsedwave.com/absolute-homogeneity 
HTTP 404: https://collapsedwave.com/binary-operation 
HTTP 404: https://collapsedwave.com/subfield 
HTTP 

In [5]:
# Broken
show_urls(results[results["broken"]==True])

Unnamed: 0,crawled,broken,url
./Artificial-Neural-Network,True,True,https://collapsedwave.com/Artificial-Neural-Network
./CNOT,True,True,https://collapsedwave.com/CNOT
./Critical-Points,True,True,https://collapsedwave.com/Critical-Points
./DBSCAN,True,True,https://collapsedwave.com/DBSCAN
./Diffeomorphism,True,True,https://collapsedwave.com/Diffeomorphism
./Dual-Tree-Boruvka,True,True,https://collapsedwave.com/Dual-Tree-Boruvka
./EPR,True,True,https://collapsedwave.com/EPR
./Elman-Networks,True,True,https://collapsedwave.com/Elman-Networks
./Energy-function,True,True,https://collapsedwave.com/Energy-function
./Equation-of-motion,True,True,https://collapsedwave.com/Equation-of-motion


In [6]:
# External URLs
show_urls(results[list(map(lambda x: urlparse(x).path.startswith('/'), results.index))])

Unnamed: 0,crawled,broken,url
/,True,False,https://collapsedwave.com/
/index.xml,True,False,https://collapsedwave.com/index.xml


In [7]:
# Success
show_urls(results[results["broken"]==False])

Unnamed: 0,crawled,broken,url
,True,False,https://collapsedwave.com
.,True,False,https://collapsedwave.com/
..,True,False,https://collapsedwave.com/
../tags/classical-mechanics,True,False,https://collapsedwave.com/tags/classical-mechanics
../tags/definition,True,False,https://collapsedwave.com/tags/definition
../tags/graph-theory,True,False,https://collapsedwave.com/tags/graph-theory
../tags/machine-learning,True,False,https://collapsedwave.com/tags/machine-learning
../tags/math,True,False,https://collapsedwave.com/tags/math
../tags/physics,True,False,https://collapsedwave.com/tags/physics
../tags/quantum-theory,True,False,https://collapsedwave.com/tags/quantum-theory
