## Links Checker v4.1
**This Python program uses web scraping to create a site map and find every link on every page to check the response of the URLs and make sure they are active.**

1) Run the following cells in order. 

In [1]:
#%% import libraries
import requests
import certifi
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit
import pandas as pd
from IPython.display import display
import ipywidgets as widgets
import warnings
from datetime import datetime as dt
import time
import os

2) Enter the starting URL here. List any links that should be left out of the link check, each in quotes and separated by commas. This could include links to Facebook or LinkedIn, for example.

In [2]:
# Starting URL: the page the site map is built from.
base_url = "https://www.example.com"

# Links listed here are left out when the links on each page are collected.
skip_links = [
    "https://www.facebook.com",
    "https://www.linkedin.com",
]

3) Run the cell below to create functions and session settings.

In [3]:
#%% set web agent variables and headers

# Browser-like request headers. The Referer is the configured starting URL.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    ),
    "Referer": f"{base_url}",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}

# Session that carries the headers above on every request made through it.
# NOTE(review): no other cell visible in this notebook references `session`;
# they call requests.get / requests.head directly.
session = requests.Session()
session.headers.update(headers)

#%% create site map from base url

def site_mapper(base_url, mapped=None, timeout=30):
    """Recursively build a site map of the pages found beneath ``base_url``.

    The page at ``base_url`` is downloaded and every ``<a href>`` on it is
    resolved against ``base_url``. Links that start with ``base_url`` are kept
    and each one is then crawled with the same rule.

    Args:
        base_url: URL of the page to start from. A trailing "/" is appended
            when it is missing so every page is stored under one spelling.
        mapped: Set of page URLs that have already been requested. It is
            shared with the recursive calls and updated in place; leave it as
            ``None`` for a fresh crawl.
        timeout: Seconds to wait for the server before giving up on a page.

    Returns:
        A dict mapping each crawled page URL to the set of links found on
        that page that start with the page's own URL.

    Raises:
        requests.exceptions.RequestException: If the page at ``base_url``
            itself cannot be fetched or answers with an HTTP error status.
            Failures on pages discovered below it are printed and skipped so
            one dead link does not throw away the rest of the crawl.
    """
    if mapped is None:
        mapped = set()

    if not base_url.endswith("/"): # add / to all links for consistency
        base_url += "/"

    # Mark the page before fetching it so a page that fails is not requested
    # again when another page links to it.
    mapped.add(base_url)

    # Send the notebook's browser headers, but let requests choose
    # Accept-Encoding so it only advertises encodings it can decode
    # (brotli needs an optional package).
    page_headers = {
        key: value for key, value in headers.items()
        if key.lower() != "accept-encoding"
    }

    response = requests.get(
        base_url,
        headers=page_headers,
        timeout=timeout,
        verify=certifi.where(),
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    links = set()

    # parse html and find all links
    for anchor in soup.find_all("a"):
        href = anchor.attrs.get("href")
        if not href:
            continue

        try:
            link = urljoin(base_url, href)
            fragment = urlsplit(link).fragment
        except ValueError:
            continue # malformed href that urllib cannot parse

        if fragment: # skip any fragments on page
            continue

        # only include links that branch from the base url, skip if already in set
        if link.startswith(base_url) and link != base_url and link not in mapped:
            links.add(link)

    urls_dict = {base_url: links}

    # recursive call to iterate through site map
    for link in links:
        # Compare the slash-terminated spelling, because that is the form
        # stored in `mapped`; otherwise "/page" is fetched again after
        # "/page/" has already been crawled.
        target = link if link.endswith("/") else link + "/"
        if target in mapped:
            continue

        try:
            sub_links = site_mapper(link, mapped, timeout)
        except requests.exceptions.RequestException as exc:
            print(f"Skipping {target} in site map: {exc}")
            continue

        urls_dict.update(sub_links)

    return urls_dict


#%% read HTML and extract all urls from page

def get_links(base_urls, skip_links=skip_links, timeout=30):
    """Collect the outgoing http(s) links found on each page in ``base_urls``.

    Args:
        base_urls: Iterable of page URLs to download and scan.
        skip_links: Collection of URLs to leave out; a link is dropped when it
            is equal to one of these entries. The default is the notebook's
            ``skip_links`` list as it existed when this cell was run, so
            re-run this cell after editing that list.
        timeout: Seconds to wait for the server before giving up on a page.

    Returns:
        A dict mapping each page URL (as a string) to the set of absolute
        links found on it. Fragment links, links whose scheme is not http or
        https (mailto, tel, javascript, ...), skipped links and links back to
        the page itself are excluded. A page that cannot be downloaded is
        reported with ``print`` and mapped to an empty set.
    """
    # Send the notebook's browser headers, but let requests choose
    # Accept-Encoding so it only advertises encodings it can decode
    # (brotli needs an optional package).
    page_headers = {
        key: value for key, value in headers.items()
        if key.lower() != "accept-encoding"
    }

    urls_dict = {}

    for base_url in base_urls:

        links = set()
        urls_dict[f"{base_url}"] = links

        try:
            response = requests.get(
                base_url,
                headers=page_headers,
                timeout=timeout,
                verify=certifi.where(),
            )
        except requests.exceptions.RequestException as exc:
            # Keep going: one unreachable page should not lose the others.
            print(f"Could not read links from {base_url}: {exc}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # parse html and find all links
        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href")
            if not href:
                continue

            try:
                link = urljoin(base_url, href)
                parts = urlsplit(link)
            except ValueError:
                continue # malformed href that urllib cannot parse

            if parts.fragment: # skip any fragments on page
                continue

            # Only http(s) links can be checked with an HTTP request; this
            # also skips links to email addresses (mailto:).
            if parts.scheme not in ("http", "https"):
                continue

            if link in skip_links or link == base_url:
                continue

            links.add(link)

    return urls_dict

4) Run the cell below to create a site map and collect all links. Warning: this will take several minutes to complete.

In [4]:
# Build the site map from the configured starting URL.
sitemap = site_mapper(base_url=base_url)

# Collect the links on every page that appears as a key of the site map.
site_links = get_links(base_urls=sitemap.keys())

5) Run this cell to check the status of each link. **Warning: this can take up to several hours.**

In [5]:
#%% check every url and store link status in a table

warnings.filterwarnings("ignore")

MAX_ATTEMPTS = 4      # total tries for a link whose request fails with an SSL error
REQUEST_TIMEOUT = 30  # seconds to wait for a server before recording an error

# Text for the "Response" column; any other status code is reported as "Undefined".
RESPONSE_TEXT = {
    200: "Success",
    400: "Bad Request",
    401: "Unauthorized",
    403: "Forbidden",
    404: "Not Found",
    408: "Request Timeout",
    500: "Internal Server Error",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
}


def _check_link(link):
    """Send a HEAD request to ``link`` and describe the outcome.

    SSL errors are retried up to MAX_ATTEMPTS times, sleeping 2, 4, 6 ...
    seconds between tries. Any other failure is final and ends the check
    immediately, so an unreachable link can never loop forever.

    Returns:
        A ``(link_status, status_code, response_text)`` tuple holding the
        values for the "Link Status", "Status Code" and "Response" columns.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.head(
                link,
                allow_redirects=True,
                headers=headers,
                timeout=REQUEST_TIMEOUT,
                verify=certifi.where(),
            )
        except requests.exceptions.SSLError:
            if attempt < MAX_ATTEMPTS:
                time.sleep(2 * attempt)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the run.
            return ("Error", "Error", "Error")
        else:
            status_code = response.status_code
            link_status = "Active" if status_code == 200 else "Inactive"
            return (link_status, status_code, RESPONSE_TEXT.get(status_code, "Undefined"))

    return ("SSL Error", "SSL Error", "SSL Error")


# show progress bar (one step per page)
progress = 0
progress_bar = widgets.IntProgress(
    value = progress,
    min = 0,
    max = len(site_links),
    description = "Loading: ",
    bar_style = "success",
    style = {"bar_color": "blue"},
    orientation = "horizontal"
)
display(progress_bar)

# NOTE: this name shadows the built-in `dict`; it is kept because the cell
# that builds the results table reads `dict`.
dict = {"Base URL": [], "Link URL": [], "Link Status": [], "Status Code": [], "Response": []}

# Result per link, so a link that appears on many pages is requested only once.
checked_links = {}

# `page_url` rather than `base_url`, so the configured starting URL is not
# overwritten when this cell runs.
for page_url, link_set in site_links.items():
    progress += 1
    progress_bar.value = progress

    for link in link_set:
        if link not in checked_links:
            checked_links[link] = _check_link(link)
        link_status, status_code, response_text = checked_links[link]

        dict["Base URL"].append(page_url)
        dict["Link URL"].append(link)
        dict["Link Status"].append(link_status)
        dict["Status Code"].append(status_code)
        dict["Response"].append(response_text)
    

IntProgress(value=0, bar_style='success', description='Loading: ', max=1, style=ProgressStyle(bar_color='blue'…

6) Display and export the link status table to a CSV file.

In [None]:
#%% create dataframe and display results

# Each key of `dict` becomes a column; each checked (page, link) pair is a row.
links_status_df = pd.DataFrame(data=dict)
display(links_status_df)

Unnamed: 0,Base URL,Link URL,Link Status,Status Code,Response
0,https://www.example.com/,https://www.iana.org/domains/example,Active,200,Success


In [None]:
#%% export links table

# Date stamp for the file name, formatted month-day-year with a two-digit year.
current_date = f"{dt.today():%m-%d-%y}"

# Write the table to the current working directory without the index column.
links_status_df.to_csv(
    os.path.join(os.getcwd(), "LinksChecker_Output_" + current_date + ".csv"),
    index=False,
)