In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd

def extract_links(url, visited):
    """
    Extract all unique links from the given URL that have not been visited.
    """
    links = set()
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            full_url = urljoin(url, href)
            if urlparse(full_url).scheme in ['http', 'https'] and full_url not in visited:
                links.add(full_url)
    except Exception as e:
        print(f"Error extracting links from {url}: {e}")
    return links

def get_domain(url):
    """
    Extract the domain name from a URL.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc

def crawl_website(start_url, max_depth):
    """
    Crawl a website starting from `start_url` and explore up to `max_depth` layers deep.
    """
    visited = set()
    todo = set([start_url])
    unique_links = set()
    level = 0

    while level < max_depth and todo:
        new_todo = set()
        for url in list(todo):
            if url not in visited:
                visited.add(url)
                print(f"Processing Level {level+1}/{max_depth}, URL: {url}")
                sublinks = extract_links(url, visited)
                unique_links.update(sublinks)
                new_todo.update(sublinks)
        todo = new_todo
        level += 1

    # Convert unique links to a DataFrame with two columns: 'Link' and 'Domain'
    data = {'Link': list(unique_links), 'Domain': [get_domain(link) for link in unique_links]}
    df = pd.DataFrame(data)
    return df


In [16]:
# Example usage:
start_url = 'https://governmentassistanceonline.com/'
max_depth = 2
df = crawl_website(start_url, max_depth)
print("\nFinal Links table:\n", df)

Processing Level 1/2, URL: https://governmentassistanceonline.com/
Processing Level 2/2, URL: https://governmentassistanceonline.com/privacy/
Processing Level 2/2, URL: https://governmentassistanceonline.com/about-us
Processing Level 2/2, URL: https://opgcustomerprivacy.com/contact-us/
Processing Level 2/2, URL: https://governmentassistanceonline.com/#content
Processing Level 2/2, URL: https://governmentassistanceonline.com
Processing Level 2/2, URL: https://opgcustomerprivacy.com/marketing-partners/
Processing Level 2/2, URL: https://opgguides.com/faqs/general/
Processing Level 2/2, URL: https://governmentassistanceonline.com/programs/
Processing Level 2/2, URL: https://opgcustomerprivacy.com/do-not-sell-my-information/
Processing Level 2/2, URL: https://opgcustomerprivacy.com/california-privacy-request/
Processing Level 2/2, URL: https://governmentassistanceonline.com/guides/
Processing Level 2/2, URL: https://governmentassistanceonline.com/unsubscribe/?refSite=governmentassistanceon

In [17]:
df['Domain'].unique()

array(['opgguides.com', 'governmentassistanceonline.com',
       'opgcustomerprivacy.com'], dtype=object)