In [12]:
# Install required packages
%pip install -qU langchain-community beautifulsoup4 lxml langchain-ollama

3573.86s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [13]:
import os
from dotenv import load_dotenv

load_dotenv('./../env')
os.environ['LANGSMITH_ENDPOINT']

'https://api.smith.langchain.com'

In [14]:

import re
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from bs4 import BeautifulSoup

# Step 1: Load Documents from Python Documentation
def bs4_extractor(html: str) -> str:
    """Custom extractor to clean HTML content"""
    soup = BeautifulSoup(html, "lxml")
    # Remove script, style, and navigation elements
    for script in soup(["script", "style", "nav", "header", "footer"]):
        script.decompose()
    return soup.get_text(strip=True)

loader = RecursiveUrlLoader(
    "https://docs.python.org/3/tutorial/",  # Specific tutorial section
    max_depth=2,  # Crawl up to 2 levels deep
    extractor=bs4_extractor,  # Use custom extractor for cleaner text
    timeout=15,  # Increased timeout
    prevent_outside=True  # Stay within the Python docs domain
)

# Load the documents
docs = loader.load()

# Print basic document information
print(f"Total documents loaded: {len(docs)}")
for doc in docs[:3]:  # Print first 3 document details
    print(f"Source: {doc.metadata.get('source', 'N/A')}")
    print(f"Title: {doc.metadata.get('title', 'N/A')}")
    print(f"Content length: {len(doc.page_content)} characters\n")



Total documents loaded: 18
Source: https://docs.python.org/3/tutorial/
Title: The Python Tutorial — Python 3.13.2 documentation
Content length: 6522 characters

Source: https://docs.python.org/3/tutorial/floatingpoint.html
Title: 15. Floating-Point Arithmetic: Issues and Limitations — Python 3.13.2 documentation
Content length: 12325 characters

Source: https://docs.python.org/3/tutorial/stdlib2.html
Title: 11. Brief Tour of the Standard Library — Part II — Python 3.13.2 documentation
Content length: 13834 characters



In [15]:
# Step 2: Set up Ollama for Document Analysis
# Ensure you've pulled the model first: `ollama pull llama3`
llm = OllamaLLM(model='llama3.2:1b')

# Step 3: Create a Chain for Documentation Analysis
analysis_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert Python documentation analyst."),
    ("human", """Carefully analyze the Python tutorial documentation and provide:
    1. Key learning points for Python beginners
    2. Important programming concepts covered
    3. Unique features of Python highlighted in the documentation
    4. Recommended learning path

    Documentation:
    {docs}""")
])

# Create the analysis chain
analysis_chain = analysis_prompt | llm | StrOutputParser()

# Combine first few documents to stay within token limits
combined_docs = " ".join([doc.page_content for doc in docs[:5]])

# Perform analysis
analysis_result = analysis_chain.invoke({"docs": combined_docs})
print("\n--- Python Documentation Analysis ---")
print(analysis_result)




--- Python Documentation Analysis ---
This page contains an extensive collection of Python documentation, covering various topics such as coding style, data structures, control flow tools, and more.

**Introduction**

The page starts with an introduction to Python, including its history, features, and benefits.

**Coding Style**

The next section discusses the importance of coding style in Python development. It provides PEP 8 guidelines for writing readable and maintainable code.

**Data Structures**

This topic covers various data structures used in Python programming, such as lists, dictionaries, sets, and tuples.

* **Lists**: A sequence of items that can be stored, modified, or accessed using indexing.
* **Dictionaries**: An unordered collection of key-value pairs.
* **Sets**: An unordered collection of unique items.
* **Tuples**: A fixed-size, immutable collection of items.

**Control Flow Tools**

This section covers various control flow statements used in Python programming, i

In [16]:
# Step 4: Create an Interactive Documentation Query Chain
query_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant specializing in Python documentation."),
    ("human", "Based on the documentation, {query}")
])

query_chain = query_prompt | llm | StrOutputParser()

# Example interactive queries
queries = [
    "Explain the key differences between lists and tuples in Python",
    "Describe the basic syntax for creating functions",
    "What are the main control flow tools in Python?"
]

print("\n--- Interactive Documentation Queries ---")
for query in queries:
    print(f"\nQuery: {query}")
    response = query_chain.invoke({"query": query, "docs": combined_docs})
    print("Response:", response)




--- Interactive Documentation Queries ---

Query: Explain the key differences between lists and tuples in Python
Response: In Python, `list` and `tuple` are two fundamental data structures that can store collections of values. While they share some similarities, there are significant differences between them.

**Key Differences:**

### 1. Ordered vs Unordered

*   **List**: In Python, lists are ordered collections of elements, meaning their order is preserved when inserted or accessed.
*   **Tuple**: Tuples are also ordered collections, but unlike lists, they cannot be changed after creation (more on this later). When a tuple is modified, it creates a new tuple and does not modify the original.

### 2. Indexing and Slicing

*   **List**: In Python, indexing starts from 0, so you can access elements using their position.
*   **Tuple**: Tuples also support indexing and slicing, but with some limitations. You cannot create tuples with negative indices or use slicing to access non-existen

In [17]:
# Step 5: Extract Structured Information
def extract_key_sections(documents):
    """Extract key sections from the documentation"""
    sections = {
        "basic_syntax": [],
        "data_structures": [],
        "functions": [],
        "control_flow": [],
        "classes": []
    }
    
    for doc in documents:
        content = doc.page_content.lower()
        
        # Identify and categorize sections
        if re.search(r'\bdef\b|\bfunction\b|\bmethod\b', content):
            sections["functions"].append(doc)
        
        if re.search(r'\blist\b|\btuple\b|\bdict\b|\bset\b', content):
            sections["data_structures"].append(doc)
        
        if re.search(r'\bif\b|\belse\b|\bwhile\b|\bfor\b|\bbreak\b|\bcontinue\b', content):
            sections["control_flow"].append(doc)
        
        if re.search(r'\bclass\b|\binheritance\b|\bobject-oriented\b', content):
            sections["classes"].append(doc)
        
        # Basic syntax detection
        if re.search(r'\bprint\b|\bassignment\b|\bindentation\b', content):
            sections["basic_syntax"].append(doc)
    
    return sections

# Extract key sections
key_sections = extract_key_sections(docs)

# Print out the extracted sections
print("\n--- Extracted Documentation Sections ---")
for section, section_docs in key_sections.items():
    print(f"\n{section.replace('_', ' ').title()} Section:")
    print(f"Number of documents: {len(section_docs)}")
    
    # Print sources for each section
    for doc in section_docs[:2]:  # Print first 2 sources per section
        print(f"- Source: {doc.metadata.get('source', 'N/A')}")

# Bonus: Create a summary of each section
section_summary_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert Python documentation analyst."),
    ("human", """Provide a concise summary of the {section_name} section based on the following content:

{section_content}

Focus on key points, important concepts, and practical usage.""")
])

section_summary_chain = section_summary_prompt | llm | StrOutputParser()

print("\n--- Section Summaries ---")
for section_name, section_docs in key_sections.items():
    # Combine content of documents in this section
    section_content = " ".join([doc.page_content for doc in section_docs[:3]])
    
    print(f"\n{section_name.replace('_', ' ').title()} Summary:")
    try:
        summary = section_summary_chain.invoke({
            "section_name": section_name.replace('_', ' '),
            "section_content": section_content
        })
        print(summary)
    except Exception as e:
        print(f"Error generating summary: {e}")




--- Extracted Documentation Sections ---

Basic Syntax Section:
Number of documents: 13
- Source: https://docs.python.org/3/tutorial/floatingpoint.html
- Source: https://docs.python.org/3/tutorial/stdlib2.html

Data Structures Section:
Number of documents: 14
- Source: https://docs.python.org/3/tutorial/
- Source: https://docs.python.org/3/tutorial/stdlib2.html

Functions Section:
Number of documents: 14
- Source: https://docs.python.org/3/tutorial/
- Source: https://docs.python.org/3/tutorial/floatingpoint.html

Control Flow Section:
Number of documents: 18
- Source: https://docs.python.org/3/tutorial/
- Source: https://docs.python.org/3/tutorial/floatingpoint.html

Classes Section:
Number of documents: 9
- Source: https://docs.python.org/3/tutorial/
- Source: https://docs.python.org/3/tutorial/stdlib2.html

--- Section Summaries ---

Basic Syntax Summary:
This document provides an overview of the fundamental concepts, syntax, and best practices for writing Python code. Here are some