# Setup

In [1]:
from urllib.parse import urljoin, urlparse
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

# Helper functions

In [2]:
def get_links_to_internal_urls(path, netloc):
    url = urljoin(f"https://{netloc}",path)
    response = requests.get(url)
    if response.status_code != 200:
        message = f'HTTP {response.status_code}: {url} '
        print(message)
        raise ValueError(message)
    soup = BeautifulSoup(
        response.content,
        'html.parser'
    )
    links = [urlparse(l.get("href")) for l in soup.find_all('a') if l is not None and l.get("href") is not None]
    return [
        l.path for l in links
        if l.netloc=='' or l.netloc == netloc
    ]

def print_urls(url_list):
    return sorted([urljoin(f"https://{netloc}",l) for l in list(url_list)])

In [3]:
urlparse('void(0)')

ParseResult(scheme='', netloc='', path='void(0)', params='', query='', fragment='')

# Goal

We want to quickly get all broken URLs from a website. The steps will be
1. Submit the base URL
2. Recursively follow the links to get an exhaustive list of all pages
3. If a webpage returns a non 200 result, add it to the list of broken URLs

In [4]:
starting_url = urlparse('https://collapsedwave.com/')
netloc = starting_url.netloc
results = pd.DataFrame(
    data=[[False, False]],
    columns=["crawled", "broken"], index=[starting_url.path]
)
results

Unnamed: 0,crawled,broken
/,False,False


In [5]:
with tqdm() as pbar:
    while not results[results["crawled"]==False].empty:
        current_url = results[results["crawled"]==False].index[0]
        try:
            new_links = get_links_to_internal_urls(path=current_url, netloc=netloc)
            new_urls = set(new_links).difference(set(results.index))
            # print(urljoin(f"https://{netloc}", current_url))
            # print(new_urls)
            for new_url in new_urls:
                results.loc[new_url] = [False, False]
            results.loc[current_url] = [True, False]
        except:
            results.loc[current_url] = [True, True]
        pbar.update(1)
        seen = results[results['crawled']==True].shape[0]
        failed = results[results['broken']==True].shape[0]
        remaining = results[results['crawled']==False].shape[0]
        pbar.set_description(f"{seen} seen | {failed} failed | {remaining} remaining")

0it [00:00, ?it/s]

HTTP 404: https://collapsedwave.com/Riesz-Representation-Theorem 
HTTP 404: https://collapsedwave.com/Implicit-Function-Theorem 
HTTP 404: https://collapsedwave.com/Diffeomorphism 




HTTP 404: https://collapsedwave.com/qubit 
HTTP 404: https://collapsedwave.com/anti-commutative 
HTTP 404: https://collapsedwave.com/Jacobi-identity 
HTTP 404: https://collapsedwave.com/quantum-state-vector 
HTTP 404: https://collapsedwave.com/real-line 
HTTP 404: https://collapsedwave.com/supremum-operator 
HTTP 404: https://collapsedwave.com/conjugate 
HTTP 404: https://collapsedwave.com/interval 
HTTP 404: https://collapsedwave.com/Plancks-reduced-constant 
HTTP 404: https://collapsedwave.com/wave-function 
HTTP 404: https://collapsedwave.com/unit-vector 
HTTP 404: https://collapsedwave.com/quantum-system-state 
HTTP 404: https://collapsedwave.com/scalar-field 
HTTP 404: https://collapsedwave.com/triangle-inequality 
HTTP 404: https://collapsedwave.com/scalar 
HTTP 404: https://collapsedwave.com/subfield 
HTTP 404: https://collapsedwave.com/point-separation 
HTTP 404: https://collapsedwave.com/absolute-homogeneity 
HTTP 404: https://collapsedwave.com/vector 
HTTP 404: https://collap

In [6]:
# Broken
print_urls(results[results["broken"]==True].index)

['https://collapsedwave.com/Artificial-Neural-Network',
 'https://collapsedwave.com/CNOT',
 'https://collapsedwave.com/Critical-Points',
 'https://collapsedwave.com/DBSCAN',
 'https://collapsedwave.com/Diffeomorphism',
 'https://collapsedwave.com/Dual-Tree-Boruvka',
 'https://collapsedwave.com/EPR',
 'https://collapsedwave.com/Elman-Networks',
 'https://collapsedwave.com/Energy-function',
 'https://collapsedwave.com/Equation-of-motion',
 'https://collapsedwave.com/F1-score',
 'https://collapsedwave.com/Fully-recurrent-neural-networks',
 "https://collapsedwave.com/Hamilton's-Equations",
 'https://collapsedwave.com/Hooke`s-law',
 'https://collapsedwave.com/Implicit-Function-Theorem',
 'https://collapsedwave.com/Jacobi-identity',
 'https://collapsedwave.com/Jordan-Networks',
 'https://collapsedwave.com/Lebesgue-measure',
 'https://collapsedwave.com/Leibniz-rule',
 'https://collapsedwave.com/Linear-Operator',
 'https://collapsedwave.com/Operator',
 'https://collapsedwave.com/Partial-Deriva

In [7]:
# External URLs
[u for u in results.index if not urlparse(u).path.startswith('/')]

['',
 './📘-Block-Sphere',
 './📘-Ubiquitous-Language',
 './📘-Commutator',
 './📘-Jacobian',
 './📙-Differential-Equations',
 './📘-Quantum-observable',
 './📙-Legendre-Transform',
 './📘-Quantum-Hilbert-Space',
 './📘-Gradient',
 './📘-Vector-Space',
 './📘-Poisson-bracket',
 './📘-Spanning-tree',
 './📘-Vector-field',
 './📘-Frequency-of-Oscillation',
 './📘-Gauge-Theory',
 './📙-Trotterization',
 './📘-Entropy',
 './📘-Potential-energy',
 './📘-Pauli-Matrix',
 './📘-Quantum-Field-Theory',
 './📘-Curl',
 './📙-HDBSCAN',
 './📕-Postulates-of-Quantum-Mechanics',
 './Riesz-Representation-Theorem',
 './📘-Linear-Operator',
 './📗-Newtons-law',
 './📗-No-cloning-theorem',
 './📘-Harmonic-Oscillator',
 './📘-Damped-Harmonic-Oscillator',
 './📘-Trie',
 './📘-Convex-function',
 './📘-Minimum-spanning-tree',
 './📘-Trajectory',
 './Implicit-Function-Theorem',
 './📘-Kinetic-energy',
 './📘-Metric-space',
 './📘-Del-or-Nabla',
 './📘-Hamiltonian-Operator',
 './📙-k-Nearest-Neighbours',
 './📘-Conservative-force',
 './📘-Laplacian'

In [8]:
# Success
print_urls(results[results["broken"]==False].index)

['https://collapsedwave.com',
 'https://collapsedwave.com/',
 'https://collapsedwave.com/',
 'https://collapsedwave.com/',
 'https://collapsedwave.com/Help-me❓',
 'https://collapsedwave.com/index.xml',
 'https://collapsedwave.com/tags/classical-mechanics',
 'https://collapsedwave.com/tags/classical-mechanics',
 'https://collapsedwave.com/tags/definition',
 'https://collapsedwave.com/tags/definition',
 'https://collapsedwave.com/tags/graph-theory',
 'https://collapsedwave.com/tags/graph-theory',
 'https://collapsedwave.com/tags/machine-learning',
 'https://collapsedwave.com/tags/machine-learning',
 'https://collapsedwave.com/tags/math',
 'https://collapsedwave.com/tags/math',
 'https://collapsedwave.com/tags/physics',
 'https://collapsedwave.com/tags/physics',
 'https://collapsedwave.com/tags/quantum-theory',
 'https://collapsedwave.com/tags/quantum-theory',
 'https://collapsedwave.com/📕-Maxwell-Equations',
 'https://collapsedwave.com/📕-Maxwell-Equations',
 'https://collapsedwave.com/📕-

In [9]:
get_links_to_internal_urls('void(0)', netloc)

HTTP 404: https://collapsedwave.com/void(0) 


ValueError: HTTP 404: https://collapsedwave.com/void(0) 