<a href="https://colab.research.google.com/github/marcprojer/websiteCrawler/blob/main/webcrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4



In [4]:
!pip install pandas openpyxl



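Complete crawl: starting from `start_url`, follow every link found until `max_pages` pages have been fetched, then save the visited URLs to `crawled_links.xlsx`.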

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Function to get the HTML content of a page
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely, so a single
            unresponsive host would stall the whole crawl.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (any ``requests.exceptions.RequestException``, which includes
        timeouts and the HTTPError raised for 4xx/5xx responses). The error
        is printed rather than raised so the caller can move on.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract all links from the page
def extract_links(html, base_url):
    """Return the crawlable links found in a page, as absolute URLs.

    Args:
        html: HTML source of the page.
        base_url: URL the page was fetched from; relative hrefs are resolved
            against it.

    Returns:
        A list of absolute ``http``/``https`` URLs in document order, with
        ``#fragment`` parts removed and duplicates dropped. Links using
        other schemes (``mailto:``, ``javascript:``, ``tel:`` ...) are
        omitted, as are hrefs that cannot be parsed.
    """
    # Imported locally so this function works with the cell's existing imports.
    from urllib.parse import urldefrag, urlparse

    soup = BeautifulSoup(html, "html.parser")
    links = []
    seen = set()

    for anchor in soup.find_all('a', href=True):
        href = anchor['href'].strip()
        try:
            # Convert relative URLs to absolute URLs; the fragment only
            # addresses a position inside the page, so it is discarded.
            absolute, _fragment = urldefrag(urljoin(base_url, href))
            scheme = urlparse(absolute).scheme
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it rather
            # than abort the whole crawl.
            continue

        if scheme not in ('http', 'https') or absolute in seen:
            continue

        seen.add(absolute)
        links.append(absolute)

    return links

# Basic web crawler function with data export to Excel
def crawl_website(start_url, max_pages=5, output_file='crawled_data.xlsx'):
    """Crawl breadth-first from ``start_url`` and save the visited URLs to Excel.

    Args:
        start_url: First URL to fetch.
        max_pages: Stop once this many pages have been fetched successfully.
        output_file: Path of the Excel workbook to write. It receives one
            column, 'Crawled URLs', listing the pages in the order they
            were fetched.

    A URL for which ``get_html`` returns ``None`` is skipped: it is not
    written to the workbook and does not count towards ``max_pages``. Every
    URL is queued at most once, so a failing URL is not fetched again when
    other pages link to it.
    """
    # Imported locally so this function works with the cell's existing imports.
    from collections import deque

    to_visit = deque([start_url])  # FIFO queue of URLs still to fetch
    seen = {start_url}             # Every URL ever queued (O(1) duplicate check)
    crawled_data = []              # Successfully fetched URLs, in visit order

    while to_visit and len(crawled_data) < max_pages:
        url = to_visit.popleft()  # Get the next URL to visit

        print(f"Visiting: {url}")
        html = get_html(url)

        if html is None:
            continue

        crawled_data.append(url)  # Store visited URL

        for link in extract_links(html, url):
            if link not in seen:
                seen.add(link)
                to_visit.append(link)  # Add new links to the visit list

    # Save the crawled data to an Excel file using pandas
    df = pd.DataFrame(crawled_data, columns=['Crawled URLs'])
    df.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# Launch the crawl; the visited URLs are written to 'crawled_links.xlsx'.
# "EXAMPLE" is a placeholder: replace it with the full URL (including
# http:// or https://) of the site to crawl.
start_url = "EXAMPLE"
crawl_website(
    start_url,
    max_pages=100,
    output_file='crawled_links.xlsx',
)


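Crawl with a depth limit: same crawler, but links are only followed up to `max_depth` clicks away from `start_url` (depth 0 is the start page itself). Results are saved to `crawled_links_with_depth.xlsx`.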

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# Function to get the HTML content of a page
def get_html(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely, so a single
            unresponsive host would stall the whole crawl.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (any ``requests.exceptions.RequestException``, which includes
        timeouts and the HTTPError raised for 4xx/5xx responses). The error
        is printed rather than raised so the caller can move on.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract all links from the page
def extract_links(html, base_url):
    """Return the crawlable links found in a page, as absolute URLs.

    Args:
        html: HTML source of the page.
        base_url: URL the page was fetched from; relative hrefs are resolved
            against it.

    Returns:
        A list of absolute ``http``/``https`` URLs in document order, with
        ``#fragment`` parts removed and duplicates dropped. Links using
        other schemes (``mailto:``, ``javascript:``, ``tel:`` ...) are
        omitted, as are hrefs that cannot be parsed.
    """
    # Imported locally so this function works with the cell's existing imports.
    from urllib.parse import urldefrag, urlparse

    soup = BeautifulSoup(html, "html.parser")
    links = []
    seen = set()

    for anchor in soup.find_all('a', href=True):
        href = anchor['href'].strip()
        try:
            # Convert relative URLs to absolute URLs; the fragment only
            # addresses a position inside the page, so it is discarded.
            absolute, _fragment = urldefrag(urljoin(base_url, href))
            scheme = urlparse(absolute).scheme
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it rather
            # than abort the whole crawl.
            continue

        if scheme not in ('http', 'https') or absolute in seen:
            continue

        seen.add(absolute)
        links.append(absolute)

    return links

# Basic web crawler function with depth control and data export to Excel
def crawl_website(start_url, max_pages=1, max_depth=1, output_file='crawled_data.xlsx'):
    """Crawl breadth-first from ``start_url`` up to ``max_depth`` and save the visited URLs.

    Args:
        start_url: First URL to fetch; it has depth 0.
        max_pages: Stop once this many pages have been fetched successfully.
        max_depth: Largest link distance from ``start_url`` that is still
            fetched. Links found on a page at ``max_depth`` are not followed.
        output_file: Path of the Excel workbook to write. It receives one
            column, 'Crawled URLs', listing the pages in the order they
            were fetched.

    A URL for which ``get_html`` returns ``None`` is skipped: it is not
    written to the workbook and does not count towards ``max_pages``. Every
    URL is queued at most once (at the depth it is first discovered), so the
    queue does not fill with duplicates and a failing URL is not fetched
    again each time another page links to it.
    """
    # Imported locally so this function works with the cell's existing imports.
    from collections import deque

    to_visit = deque([(start_url, 0)])  # FIFO queue of (URL, depth) tuples
    seen = {start_url}                  # Every URL ever queued (O(1) duplicate check)
    crawled_data = []                   # Successfully fetched URLs, in visit order

    while to_visit and len(crawled_data) < max_pages:
        url, depth = to_visit.popleft()  # Get the next URL and its depth

        # Links are only queued below max_depth, so this guard matters only
        # for the start URL when max_depth is negative.
        if depth > max_depth:
            continue

        print(f"Visiting: {url} (Depth: {depth})")
        html = get_html(url)

        if html is None:
            continue

        crawled_data.append(url)  # Store visited URL

        if depth < max_depth:  # Only extract links if we're below the max depth
            for link in extract_links(html, url):
                if link not in seen:
                    seen.add(link)
                    to_visit.append((link, depth + 1))  # Add new links with incremented depth

    # Save the crawled data to an Excel file using pandas
    df = pd.DataFrame(crawled_data, columns=['Crawled URLs'])
    df.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

# Launch the depth-limited crawl; the visited URLs are written to
# 'crawled_links_with_depth.xlsx'.
# "EXAMPLE" is a placeholder: replace it with the full URL (including
# http:// or https://) of the site to crawl.
start_url = "EXAMPLE"
crawl_website(
    start_url,
    max_pages=300,
    max_depth=1,
    output_file='crawled_links_with_depth.xlsx',
)


Download the results of the complete crawl (`crawled_links.xlsx`) from Colab to your computer:

In [None]:
# Offer the workbook as a browser download. google.colab exists only inside
# Google Colab, so the import is guarded to keep "Run All" working elsewhere.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; look for 'crawled_links.xlsx' in the current working directory.")
else:
    files.download('crawled_links.xlsx')

Download the results of the depth-limited crawl (`crawled_links_with_depth.xlsx`) from Colab to your computer:

In [None]:
# Offer the workbook as a browser download. google.colab exists only inside
# Google Colab, so the import is guarded to keep "Run All" working elsewhere.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; look for 'crawled_links_with_depth.xlsx' in the current working directory.")
else:
    files.download('crawled_links_with_depth.xlsx')