<a href="https://colab.research.google.com/github/muhammadabdulbaqi/crawling_sitemap/blob/main/crawling_sitemap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install beautifulsoup4 requests




In [9]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from datetime import datetime
import xml.etree.ElementTree as ET

# Function to get all links on a page within the specified path
def get_links(url, base_url, allowed_path, visited):
    """Fetch *url* and return the in-scope links found on that page.

    Args:
        url: Page to download and scan for ``<a href=...>`` tags.
        base_url: Site root; only links on the same host are kept.
        allowed_path: URL prefix a link must start with to be kept.
        visited: Not used by this function; kept so existing callers
            that pass it continue to work.

    Returns:
        A set of absolute URLs with any ``#fragment`` removed. An empty
        set is returned (and the error printed) if the page cannot be
        fetched or parsed.
    """
    try:
        # A timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        allowed_host = urlparse(base_url).netloc
        links = set()

        for a_tag in soup.find_all('a', href=True):
            # Relative hrefs are relative to the page they appear on (after
            # any redirect), not to the site root.
            absolute_url = urljoin(response.url, a_tag['href'])
            parsed_url = urlparse(absolute_url)

            # A fragment only addresses a position inside the same document,
            # so drop it to avoid crawling and listing the page repeatedly.
            if parsed_url.fragment:
                absolute_url = parsed_url._replace(fragment="").geturl()

            # Include only internal links within the allowed path
            if parsed_url.netloc == allowed_host and absolute_url.startswith(allowed_path):
                links.add(absolute_url)

        return links
    except Exception as e:
        # Best-effort crawl: report the failure and carry on with other pages.
        print(f"Error while crawling {url}: {e}")
        return set()

# Sitemap generator function
def generate_sitemap(base_url, allowed_path):
    """Crawl outward from *allowed_path* and collect every link discovered.

    Args:
        base_url: Site root, forwarded to ``get_links``.
        allowed_path: URL the crawl starts from; also forwarded to
            ``get_links``.

    Returns:
        The set of all links returned by ``get_links`` across every page
        that was crawled.
    """
    discovered = set()
    seen = set()
    pending = {allowed_path}

    while pending:
        page = pending.pop()
        if page in seen:
            continue

        seen.add(page)
        found = get_links(page, base_url, allowed_path, seen)
        # Queue only the pages that have not been crawled yet.
        pending |= found - seen
        discovered |= found

    return discovered

# Write sitemap to XML file
def write_sitemap_to_xml(links, output_file="sitemap.xml"):
    """Write *links* to *output_file* as a sitemaps.org ``<urlset>`` document.

    Every URL gets the same ``<lastmod>`` (the time of this call), a
    ``<changefreq>`` of ``daily`` and a ``<priority>`` of ``0.8``. URLs are
    written in sorted order.

    Args:
        links: Iterable of URL strings.
        output_file: Path of the XML file to create or overwrite.
    """
    # Imported here so this function works without changes to the
    # file-level imports.
    from xml.sax.saxutils import escape

    # Timezone-aware, second precision: a timestamp that includes a time
    # needs a zone designator to be a valid W3C datetime.
    now = datetime.now().astimezone().isoformat(timespec="seconds")

    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
    ]
    for link in sorted(links):
        parts.append("  <url>\n")
        # Escape &, < and > so URLs with query strings keep the file well-formed.
        parts.append(f"    <loc>{escape(link)}</loc>\n")
        parts.append(f"    <lastmod>{now}</lastmod>\n")
        parts.append("    <changefreq>daily</changefreq>\n")
        parts.append("    <priority>0.8</priority>\n")
        parts.append("  </url>\n")
    parts.append("</urlset>\n")

    # The prolog declares UTF-8, so write UTF-8 regardless of the locale.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write("".join(parts))
    print(f"Sitemap saved to {output_file}")

# Function to extract URLs from XML content
def extract_urls_from_xml(file_path):
    """Return the URLs listed in the ``<loc>`` elements of a sitemap file.

    Args:
        file_path: Path to an XML file using the sitemaps.org 0.9 namespace.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed and empty ``<loc>`` elements skipped. An empty
        list is returned (and a message printed) if the file is missing
        or is not well-formed XML.
    """
    try:
        # Let the XML parser read the file itself so the encoding declared
        # in the prolog is honoured instead of the locale default.
        root = ET.parse(file_path).getroot()
    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return []
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []

    # Extract URLs from <loc> tags. An empty <loc/> has text None, and
    # pretty-printed files may pad the URL with whitespace.
    loc_tag = ".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    return [
        loc.text.strip()
        for loc in root.findall(loc_tag)
        if loc.text and loc.text.strip()
    ]

# Main
if __name__ == "__main__":
    # Crawl target: site root plus the documentation subtree to stay inside.
    base_url = "https://www.enterprisedb.com/"
    allowed_path = "https://www.enterprisedb.com/docs/epas/11/"

    # Step 1: crawl and persist the sitemap.
    sitemap_file = "sitemap.xml"
    sitemap_links = generate_sitemap(base_url, allowed_path)
    write_sitemap_to_xml(sitemap_links, sitemap_file)

    # Step 2: read the sitemap back and dump its URLs, one per line.
    urls = extract_urls_from_xml(sitemap_file)
    if not urls:
        print("No URLs found or XML is invalid.")
    else:
        output_file = "urls.txt"
        with open(output_file, "w") as f:
            for url in urls:
                f.write(url + "\n")
        print(f"Extracted URLs saved to {output_file}")
