In [None]:
# Install required packages into the environment used by the running kernel.
# - langchain-community: provides RecursiveUrlLoader (web crawling)
# - langchain-ollama: provides OllamaLLM (local model client)
# - beautifulsoup4 + lxml: HTML parsing for the custom text extractor
# pip flags: -q keeps the output quiet, -U upgrades already-installed packages.
# NOTE(review): versions are unpinned, so a later re-run may install different
# releases — consider pinning for reproducibility.
%pip install -qU langchain-community beautifulsoup4 lxml langchain-ollama

In [None]:
# import os
# from dotenv import load_dotenv

# load_dotenv('./../env')
# os.environ['LANGSMITH_ENDPOINT']

In [None]:

import re
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from bs4 import BeautifulSoup

# Step 1: Load Documents from Python Documentation
def bs4_extractor(html: str) -> str:
    """Convert a raw HTML page into plain text suitable for an LLM prompt.

    Parses ``html`` with the lxml parser, removes boilerplate elements
    (``script``, ``style``, ``nav``, ``header``, ``footer``) and returns the
    text that remains.

    Args:
        html: Raw HTML markup of a single page.

    Returns:
        The remaining text of the page. Each text fragment is stripped of
        surrounding whitespace and fragments are joined with a single space.
    """
    soup = BeautifulSoup(html, "lxml")
    # Remove elements that carry no documentation content
    for element in soup(["script", "style", "nav", "header", "footer"]):
        element.decompose()
    # An explicit separator is required: with the default empty separator,
    # strip=True glues neighbouring elements together ("TitleParagraph"),
    # destroying word boundaries for the LLM and for any word-based matching
    # done on the extracted text.
    return soup.get_text(separator=" ", strip=True)

# Crawl the tutorial section of the Python docs; each fetched page is passed
# through bs4_extractor so that only cleaned text is stored.
loader = RecursiveUrlLoader(
    "https://docs.python.org/3/tutorial/",
    max_depth=2,            # follow links at most two levels deep
    timeout=15,             # per-request timeout
    prevent_outside=True,   # do not follow links that leave the start URL
    extractor=bs4_extractor,
)

# Fetch the pages
docs = loader.load()

# Report how many pages were collected, then preview the first three
print(f"Total documents loaded: {len(docs)}")
for doc in docs[:3]:
    meta = doc.metadata
    print(
        f"Source: {meta.get('source', 'N/A')}",
        f"Title: {meta.get('title', 'N/A')}",
        f"Content length: {len(doc.page_content)} characters\n",
        sep="\n",
    )



In [None]:
# Step 2: Set up Ollama for Document Analysis
# The model named below must be available locally first: `ollama pull llama3.2:1b`
llm = OllamaLLM(model='llama3.2:1b')

# Step 3: Create a Chain for Documentation Analysis
# The human message carries the instructions plus a {docs} placeholder that is
# filled with the crawled text when the chain is invoked.
analysis_messages = [
    ("system", "You are an expert Python documentation analyst."),
    ("human", """Carefully analyze the Python tutorial documentation and provide:
    1. Key learning points for Python beginners
    2. Important programming concepts covered
    3. Unique features of Python highlighted in the documentation
    4. Recommended learning path

    Documentation:
    {docs}"""),
]
analysis_prompt = ChatPromptTemplate.from_messages(analysis_messages)

# Pipeline: render prompt -> call the model -> parse the output to a string
analysis_chain = analysis_prompt | llm | StrOutputParser()

# Only the first five pages are sent, to keep the prompt size bounded
combined_docs = " ".join(doc.page_content for doc in docs[:5])

# Run the analysis and show the result under a banner
analysis_result = analysis_chain.invoke({"docs": combined_docs})
print("\n--- Python Documentation Analysis ---", analysis_result, sep="\n")



In [None]:
# Step 4: Create an Interactive Documentation Query Chain
# The prompt needs a {docs} placeholder: without it, the documentation text
# passed to invoke() is never rendered into the prompt and the model answers
# without seeing any of the loaded pages.
query_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant specializing in Python documentation."),
    ("human", """Based on the documentation, {query}

Documentation:
{docs}""")
])

# Pipeline: render prompt -> call the model -> parse the output to a string
query_chain = query_prompt | llm | StrOutputParser()

# Example interactive queries
queries = [
    "Explain the key differences between lists and tuples in Python",
    "Describe the basic syntax for creating functions",
    "What are the main control flow tools in Python?"
]

print("\n--- Interactive Documentation Queries ---")
for query in queries:
    print(f"\nQuery: {query}")
    # Both template variables are supplied: the question and the crawled text
    response = query_chain.invoke({"query": query, "docs": combined_docs})
    print("Response:", response)



In [None]:
# Step 5: Extract Structured Information
def extract_key_sections(documents):
    """Bucket documents into topic sections by keyword matching.

    Each document's ``page_content`` is lower-cased and tested against one
    whole-word regex per topic. A document is added to every topic whose
    pattern matches, so it may land in several sections or in none.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        Dict with the keys ``basic_syntax``, ``data_structures``,
        ``functions``, ``control_flow`` and ``classes`` (in that order); each
        value is the list of matching documents in input order.
    """
    # Topic -> keyword pattern; key order defines the order of the result dict
    topic_patterns = {
        "basic_syntax": r'\bprint\b|\bassignment\b|\bindentation\b',
        "data_structures": r'\blist\b|\btuple\b|\bdict\b|\bset\b',
        "functions": r'\bdef\b|\bfunction\b|\bmethod\b',
        "control_flow": r'\bif\b|\belse\b|\bwhile\b|\bfor\b|\bbreak\b|\bcontinue\b',
        "classes": r'\bclass\b|\binheritance\b|\bobject-oriented\b',
    }
    buckets = {topic: [] for topic in topic_patterns}

    for document in documents:
        # Patterns are lower-case, so match against lower-cased text
        lowered = document.page_content.lower()
        for topic, pattern in topic_patterns.items():
            if re.search(pattern, lowered) is not None:
                buckets[topic].append(document)

    return buckets

# Categorize every loaded document by topic
key_sections = extract_key_sections(docs)

# For each section, report how many documents matched and list up to two sources
print("\n--- Extracted Documentation Sections ---")
for section, matched in key_sections.items():
    heading = section.replace('_', ' ').title()
    print(
        f"\n{heading} Section:",
        f"Number of documents: {len(matched)}",
        sep="\n",
    )

    for doc in matched[:2]:
        origin = doc.metadata.get('source', 'N/A')
        print(f"- Source: {origin}")

# Bonus: Create a summary of each section
# The prompt takes a human-readable section name and the text to summarize.
section_summary_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an expert Python documentation analyst."),
    ("human", """Provide a concise summary of the {section_name} section based on the following content:

{section_content}

Focus on key points, important concepts, and practical usage.""")
])

# Pipeline: render prompt -> call the model -> parse the output to a string
section_summary_chain = section_summary_prompt | llm | StrOutputParser()

print("\n--- Section Summaries ---")
for section_name, section_docs in key_sections.items():
    readable_name = section_name.replace('_', ' ')
    print(f"\n{readable_name.title()} Summary:")

    # No document matched this section: invoking the model with empty content
    # would only yield an invented summary, so report the gap instead.
    if not section_docs:
        print("No documents found for this section.")
        continue

    # Combine at most the first three documents to keep the prompt small
    section_content = " ".join(doc.page_content for doc in section_docs[:3])

    try:
        summary = section_summary_chain.invoke({
            "section_name": readable_name,
            "section_content": section_content
        })
        print(summary)
    except Exception as e:
        # Best-effort: a failure on one section must not abort the remaining ones
        print(f"Error generating summary: {e}")

