In [None]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse

base_url = "https://python.langchain.com"
sections = [
    "/docs/get_started/introduction",
    "/docs/tutorials",
    "/docs/how_to",
    "/docs/concepts"
]

visited_urls = set()
output_dir = "langchain_selected_docs"
os.makedirs(output_dir, exist_ok=True)

def fetch_and_save(url):
    """Fetch page, extract text from <main>, and save it to a .txt file."""
    try:
        print(f"Fetching: {url}")
        res = requests.get(url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        main = soup.find("main")
        if main:
            text = main.get_text(separator="\n").strip()
            parsed = urlparse(url)
            filename = parsed.path.strip("/").replace("/", "_") or "index"
            with open(os.path.join(output_dir, f"{filename}.txt"), "w", encoding="utf-8") as f:
                f.write(text)
        return soup
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_internal_links(soup, section_root):
    """Extract only direct internal child links within the section."""
    root_parts = section_root.strip("/").split("/")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"].split("#")[0]
        if not href.startswith("/"):
            continue
        if href.startswith(section_root):
            parts = href.strip("/").split("/")
            if len(parts) == len(root_parts) + 1:
                full_url = urljoin(base_url, href)
                links.add(full_url)
    return links


for section in sections:
    section_url = urljoin(base_url, section)
    if section_url in visited_urls:
        continue
    visited_urls.add(section_url)

    soup = fetch_and_save(section_url)
    if soup:
        child_links = extract_internal_links(soup, section)
        for child_url in child_links:
            if child_url not in visited_urls:
                visited_urls.add(child_url)
                fetch_and_save(child_url)


Fetching: https://python.langchain.com/docs/get_started/introduction
Fetching: https://python.langchain.com/docs/tutorials
Fetching: https://python.langchain.com/docs/tutorials/agents/
Fetching: https://python.langchain.com/docs/tutorials/rag/
Fetching: https://python.langchain.com/docs/tutorials/classification/
Fetching: https://python.langchain.com/docs/tutorials/chatbot/
Fetching: https://python.langchain.com/docs/tutorials/retrievers/
Fetching: https://python.langchain.com/docs/tutorials/qa_chat_history/
Fetching: https://python.langchain.com/docs/tutorials/summarization/
Fetching: https://python.langchain.com/docs/tutorials/graph/
Fetching: https://python.langchain.com/docs/tutorials/extraction/
Fetching: https://python.langchain.com/docs/tutorials/llm_chain/
Fetching: https://python.langchain.com/docs/tutorials/sql_qa/
Fetching: https://python.langchain.com/docs/how_to
Fetching: https://python.langchain.com/docs/how_to/fallbacks/
Fetching: https://python.langchain.com/docs/how_to

In [4]:
# MERGING

input_dir = "langchain_selected_docs"
output_file = "merged_langchain_docs.txt"

with open(output_file, "w", encoding="utf-8") as outfile:
    for filename in sorted(os.listdir(input_dir)):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_dir, filename)
            with open(file_path, "r", encoding="utf-8") as infile:
                outfile.write(f"\n--- {filename} ---\n\n")
                outfile.write(infile.read())
                outfile.write("\n\n")


In [None]:
#/docs/get_started/introduction#installation

# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin

# base_url = "https://python.langchain.com"
# how_to_index_url = urljoin(base_url, "/docs/how_to")

# output_file = "langchain_how_to_qa.txt"

# def get_main_text(url):
#     """Extract text from <main> tag of a page."""
#     try:
#         res = requests.get(url)
#         res.raise_for_status()
#         soup = BeautifulSoup(res.text, "html.parser")
#         main = soup.find("main")
#         return main.get_text(separator="\n").strip() if main else ""
#     except Exception as e:
#         print(f"Error getting {url}: {e}")
#         return ""

# def extract_how_to_links():
#     """Extract 'How to:' entries and their links from the index page."""
#     res = requests.get(how_to_index_url)
#     soup = BeautifulSoup(res.text, "html.parser")
#     main = soup.find("main")

#     links = []
#     for a in main.find_all("a", href=True):
#         if a.text.strip().startswith("How to:"):
#             full_url = urljoin(base_url, a['href'].split("#")[0])
#             links.append((a.text.strip(), full_url))
#     return links

# def build_qa_dataset():
#     how_to_entries = extract_how_to_links()
#     with open(output_file, "w", encoding="utf-8") as f:
#         for title, url in how_to_entries:
#             print(f"Processing: {title}")
#             content = get_main_text(url)
#             f.write(f"{title}\n{content}\n\n")

# build_qa_dataset()


Processing: How to: install LangChain packages
Processing: How to: use LangChain with different Pydantic versions
Processing: How to: return structured data from a model
Processing: How to: use a model to call tools
Processing: How to: stream runnables
Processing: How to: debug your LLM apps
Processing: How to: do function/tool calling
Processing: How to: get models to return structured output
Processing: How to: cache model responses
Processing: How to: get log probabilities
Processing: How to: create a custom chat model class
Processing: How to: stream a response back
Processing: How to: track token usage
Processing: How to: track response metadata across providers
Processing: How to: use chat model to call tools
Processing: How to: stream tool calls
Processing: How to: handle rate limits
Processing: How to: few shot prompt tool behavior
Processing: How to: bind model-specific formatted tools
Processing: How to: force a specific tool call
Processing: How to: work with local models
Pr