# In [None]:
import os
import requests
from bs4 import BeautifulSoup
import html2text
from urllib.parse import urljoin, urlparse
import time
import logging

# Configure the root logger once at import time: INFO and above, with a
# "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Crawl configuration.
# BASE_URL is the prefix a URL must start with to be crawled at all.
BASE_URL = "https://python.langchain.com/docs/"
# Directory the generated Markdown files are written to.
DOCS_DIR = "langchain_docs"
# URLs already handed to the crawler; shared, mutable, grows for the whole run.
VISITED_URLS = set()
MAX_DEPTH = 3  # Pages deeper than this many link hops from a start URL are not fetched
REQUEST_DELAY = 0.5  # Seconds to sleep before each HTTP request

# Shared HTML -> Markdown converter used for every page.
h = html2text.HTML2Text()
h.ignore_links = False  # Keep links as Markdown links
h.ignore_images = True  # Leave images out of the Markdown output
h.body_width = 0  # Don't wrap lines

def sanitize_filename(name):
    """Convert an arbitrary string (typically a URL or URL path) to a safe filename.

    Every occurrence of "https://" and "http://" is removed, each character
    that is not alphanumeric or one of '.', '_', '-' is replaced by '_', and
    leading/trailing '_', '.' and '-' characters are trimmed. If nothing
    remains, "index" is returned.
    """
    # Order matters: drop the longer scheme first, exactly as two chained replaces.
    for scheme in ("https://", "http://"):
        name = name.replace(scheme, "")

    safe_punctuation = "._-"
    cleaned = "".join(
        ch if (ch.isalnum() or ch in safe_punctuation) else "_"
        for ch in name
    )
    return cleaned.strip("_.-") or "index"

def _strip_fragment(url):
    """Return *url* with any ``#fragment`` removed."""
    return urlparse(url)._replace(fragment="").geturl()


def _markdown_filename(url):
    """Build the flat ``.md`` filename used to store the page at *url*."""
    path_parts = [part for part in urlparse(url).path.split('/') if part]

    # Use only the path segments after "docs"; fall back to the whole path
    # when the URL has no "docs" segment.
    try:
        filename_parts = path_parts[path_parts.index('docs') + 1:]
    except ValueError:
        filename_parts = path_parts

    filename_base = sanitize_filename("_".join(filename_parts)) or "index"

    # Ensure .md extension
    return filename_base if filename_base.endswith(".md") else f"{filename_base}.md"


def get_and_save_page(url, current_depth):
    """Fetch *url*, save its main content as Markdown, and recurse into its links.

    The page is skipped when it was already visited, when ``current_depth``
    exceeds ``MAX_DEPTH``, or when the URL does not start with ``BASE_URL``.
    Otherwise the ``<article>`` element (falling back to ``<main>``, then
    ``<body>``) is converted to Markdown and written to ``DOCS_DIR``, and every
    ``<a href>`` inside that element is crawled at ``current_depth + 1``.

    Args:
        url: Absolute URL of the page to process.
        current_depth: Number of link hops from a start URL (start URLs are 0).

    Returns:
        None. Results are written to disk and recorded in ``VISITED_URLS``.
        Request and processing errors are logged, not raised.
    """
    if url in VISITED_URLS or current_depth > MAX_DEPTH:
        return

    # Only process URLs within the desired docs path
    if not url.startswith(BASE_URL):
        return

    # NOTE: the URL is marked visited before fetching, and regardless of depth.
    # A page first reached at MAX_DEPTH therefore never has its links followed,
    # even if it is later reachable through a shorter path.
    VISITED_URLS.add(url)
    logging.info(f"Processing (Depth {current_depth}): {url}")

    try:
        # Make request with a delay
        time.sleep(REQUEST_DELAY)
        response = requests.get(url, timeout=20, headers={'User-Agent': 'LangChainDocsScraper/1.0'})
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the main content area (article or main tag)
        main_content_area = soup.find('article') or soup.find('main') or soup.body

        if not main_content_area:
            logging.error(f"No content area found for {url}")
            return

        # Convert HTML of main content to Markdown
        markdown_content = h.handle(str(main_content_area))

        if not markdown_content.strip():
            logging.warning(f"No text content extracted from {url}")
            return

        filepath = os.path.join(DOCS_DIR, _markdown_filename(url))
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        # Save the Markdown content
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# Source URL: {url}\n\n")
            f.write(markdown_content)
        logging.info(f"Saved: {filepath}")

        # Process other links on the page
        for link in main_content_area.find_all('a', href=True):
            href = link['href']
            try:
                # Resolve relative URLs, then drop the fragment: "page#section"
                # still points at "page", which must be crawled. Same-page
                # anchors collapse to the current URL and are rejected by the
                # visited check at the top of the recursive call.
                next_url = _strip_fragment(urljoin(url, href))
            except ValueError:
                # A single malformed href must not abort the remaining links.
                logging.warning(f"Skipping malformed link on {url}: {href!r}")
                continue

            # Skip non-HTTP(S) links (mailto:, javascript:, ...)
            if not next_url.startswith(('http://', 'https://')):
                continue

            get_and_save_page(next_url, current_depth + 1)

    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
    except Exception as e:
        # Top-level boundary for one page: log with traceback and keep crawling.
        logging.error(f"Error processing {url}: {e}", exc_info=True)

if __name__ == "__main__":
    # exist_ok=True makes this a no-op when the directory already exists and
    # avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(DOCS_DIR, exist_ok=True)

    # Seed the crawl with the key top-level documentation sections
    initial_urls = [
        "https://python.langchain.com/docs/get_started",
        "https://python.langchain.com/docs/modules",
        "https://python.langchain.com/docs/use_cases",
        "https://python.langchain.com/docs/integrations",
        "https://python.langchain.com/docs/guides",
        "https://python.langchain.com/docs/concepts",
        "https://python.langchain.com/docs/expression_language",
    ]

    # Crawl the documentation starting from each seed URL at depth 0
    for start_url in initial_urls:
        get_and_save_page(start_url, 0)

    logging.info(f"Documentation download process complete. Check the '{DOCS_DIR}' directory.")
    logging.info(f"Total unique URLs visited: {len(VISITED_URLS)}")
