RAG Workshop - Session 1 Notebook

# RAG Workshop - Session 1
# Basic Implementation and Visualization

#Hello

In [7]:
# Install required packages
!pip install sentence-transformers chromadb anthropic langchain plotly



In [8]:
!pip install --upgrade chromadb



In [9]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import anthropic
import plotly.express as px
import plotly.graph_objects as go

# Section 1: Understanding Embeddings
# =================================

In [10]:
def setup_embedding_demo():
    """Basic embedding demonstration"""
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Example technical documentation snippets
    docs = [
        "Python functions are defined using the def keyword",
        "Classes in Python support inheritance and polymorphism",
        "Exception handling uses try-except blocks",
        "Python lists are mutable sequences",
        "Docstrings provide documentation for Python code"
    ]

    # Generate embeddings
    embeddings = model.encode(docs)

    # Visualize embeddings similarity
    similarity_matrix = np.inner(embeddings, embeddings)

    # Create heatmap using plotly
    fig = go.Figure(data=go.Heatmap(
        z=similarity_matrix,
        x=[f'Doc {i+1}' for i in range(len(docs))],
        y=[f'Doc {i+1}' for i in range(len(docs))],
        colorscale='Viridis'
    ))

    fig.update_layout(
        title='Document Similarity Matrix',
        xaxis_title='Documents',
        yaxis_title='Documents'
    )

    fig.show()

    return model, docs, embeddings

# Section 2: Basic RAG Implementation
# =================================

### Please create a secret in Colab with name "ANTHROPIC_API_KEY" and value as the "key-of-your-api-key"
### Once you add the name/value key pair in secret, you will need to enable it.. otherwise notebook won't recognize it.

In [11]:
from google.colab import userdata
import numpy as np
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import plotly.graph_objects as go
import anthropic


class BasicRAG:
    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client(Settings(
            persist_directory="./chroma_db"
        ))
        self.collection = self.setup_collection()

        # Retrieve the Anthropic API key
        self.anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')
        if not self.anthropic_api_key:
            raise ValueError("Anthropic API key is missing. Ensure it is set in Colab secrets.")

    def setup_collection(self):
        """Initialize ChromaDB collection with sample docs."""
        collection = self.chroma_client.get_or_create_collection(
            name="documentation_examples"
        )

        # Sample knowledge base
        docs = [
            {"id": "doc_1", "text": "Functions in Python are blocks of reusable code.", "metadata": {"source": "python_functions.md"}},
            {"id": "doc_2", "text": "Error handling in Python uses try-except blocks.", "metadata": {"source": "error_handling.md"}},
            {"id": "doc_3", "text": "Good documentation includes function signatures.", "metadata": {"source": "documentation_guidelines.md"}}
        ]

        # Add documents to collection
        for doc in docs:
            collection.add(
                documents=[doc["text"]],
                metadatas=[doc["metadata"]],
                ids=[doc["id"]]
            )

        # Display the knowledge base
        self.display_knowledge_base(docs)

        return collection

    def display_knowledge_base(self, docs):
        """Display the documents in the knowledge base."""
        print("Knowledge Base Documents:")
        for idx, doc in enumerate(docs):
            print(f"Document {idx + 1}:")
            print(f"Content:\n{doc['text']}")
            print(f"Metadata: {doc['metadata']}")
            print("-" * 50)

    def retrieve_documents(self, query: str, n_results: int = 2):
        """Retrieve relevant documents for a query."""
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )
        return results

    def generate_documentation(self, query: str, style_guide: str = "google"):
        """Generate documentation using RAG and Claude API."""
        # Retrieve relevant documents
        results = self.retrieve_documents(query)
        context = "\n".join(results['documents'][0])

        # Use Anthropic Claude to generate documentation
        client = anthropic.Anthropic(api_key=self.anthropic_api_key)

        # Define the prompt structure
        system_prompt = f"You are a documentation expert. Generate documentation following the {style_guide} style guide."
        user_query = f"""
        Relevant context from similar documentation:
        {context}

        Query to document:
        {query}
        """

        messages = [
           {"role": "user", "content": user_query}
        ]

        print("\nPrompt Sent to Claude:")
        print(system_prompt)
        print(user_query)

        # API call to generate a response
        response = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            system=system_prompt,
            messages=messages,
            max_tokens=1000,  # Number of tokens to generate
            temperature=0.2  # Controls randomness
        )

        # Extract and print the generated response
        print("\nGenerated Response:")
        print(response.content[0].text)  # Access the response content correctly

        return response.content[0].text


    def visualize_retrieval(self, query: str):
        """Visualize the retrieval process."""
        docs = self.collection.get()
        query_embedding = self.model.encode(query)
        doc_embeddings = self.model.encode(docs['documents'])

        # Calculate similarities
        similarities = np.inner(query_embedding, doc_embeddings)

        # Visualize relevance as a bar chart
        self.visualize_relevance_chart(query, docs['metadatas'], similarities)

        # Display the top retrieved documents
        self.display_retrieval_results(query, docs['documents'], similarities)

    def visualize_relevance_chart(self, query: str, docs_metadata, similarities):
        """Create a bar chart to show document relevance."""
        fig = go.Figure(data=go.Bar(
            x=[doc['source'] for doc in docs_metadata],
            y=similarities,
            text=np.round(similarities, 3),
            textposition="auto",
        ))
        fig.update_layout(
            title=f"Relevance of Documents to Query: '{query}'",
            xaxis_title="Documents",
            yaxis_title="Relevance Score",
            yaxis_range=[0, 1]
        )
        fig.show()

    def display_retrieval_results(self, query: str, docs, similarities):
        """Display the retrieved documents with their relevance scores."""
        print(f"\nQuery: {query}")
        print("\nRetrieved Documents:")
        sorted_indices = np.argsort(similarities)[::-1]
        for idx in sorted_indices[:2]:  # Top 2 documents
            print(f"Document {idx + 1} - Relevance Score: {similarities[idx]:.3f}")
            print(f"Content:\n{docs[idx]}")
            print("-" * 50)

    def visualize_similarity_matrix(self, embeddings, docs):
        """Visualize embeddings similarity matrix."""
        similarity_matrix = np.inner(embeddings, embeddings)
        fig = go.Figure(data=go.Heatmap(
            z=similarity_matrix,
            x=[f"Doc {i + 1}" for i in range(len(docs))],
            y=[f"Doc {i + 1}" for i in range(len(docs))],
            colorscale="Viridis"
        ))
        fig.update_layout(
            title="Embeddings Similarity Matrix",
            xaxis_title="Documents",
            yaxis_title="Documents"
        )
        fig.show()

    def setup_embedding_demo(self):
        """Demonstrate embeddings and visualize similarity."""
        model = self.model
        docs = [
            "Functions in Python are blocks of reusable code.",
            "Error handling in Python uses try-except blocks.",
            "Good documentation includes function signatures."
        ]
        embeddings = model.encode(docs)

        # Visualize similarity matrix
        self.visualize_similarity_matrix(embeddings, docs)

        return model, docs, embeddings

# Example Usage and Demo

In [14]:
def run_demo():
    # Initialize RAG
    rag = BasicRAG()

    # Predefined example queries
    queries = [
        "How to document Python functions?",
        "Best practices for error handling",
        "Documentation style guidelines"
    ]

    # Run predefined queries
    for query in queries:
        print(f"\nQuery: {query}")
        print("=" * 50)

        # Visualize retrieval step
        print("\nStep 1: Visualizing Document Retrieval...")
        rag.visualize_retrieval(query)

        # Generate documentation step
        print("\nStep 2: Generating Documentation...")
        doc = rag.generate_documentation(query)
        print("\nGenerated Documentation:")
        print(doc)

        # Pause for the audience to absorb or move to the next query
        input("\nPress Enter to continue to the next query...")

    # Interactive queries from students
    print("\n--- Interactive Mode: Enter Your Own Queries ---")
    while True:
        user_query = input("Enter a query (or type 'exit' to stop): ")
        if user_query.lower() == "exit":
            print("Exiting the demo. Thank you!")
            break

        print(f"\nQuery: {user_query}")
        print("=" * 50)

        # Visualize retrieval step
        print("\nStep 1: Visualizing Document Retrieval...")
        rag.visualize_retrieval(user_query)

        # Generate documentation step
        print("\nStep 3: Generating Documentation...")
        doc = rag.generate_documentation(user_query)
        print("\nGenerated Documentation:")
        print(doc)

In [15]:
if __name__ == "__main__":
    # Initialize RAG instance
    rag = BasicRAG()

    # First demo embeddings (optional visualization of similarity matrix)
    print("\n--- Step 1: Demonstrating Embeddings ---")
    rag.setup_embedding_demo()

    # Then run RAG demo
    print("\n--- Step 2: Running RAG Demo ---")
    run_demo()



Knowledge Base Documents:
Document 1:
Content:
Functions in Python are blocks of reusable code.
Metadata: {'source': 'python_functions.md'}
--------------------------------------------------
Document 2:
Content:
Error handling in Python uses try-except blocks.
Metadata: {'source': 'error_handling.md'}
--------------------------------------------------
Document 3:
Content:
Good documentation includes function signatures.
Metadata: {'source': 'documentation_guidelines.md'}
--------------------------------------------------

--- Step 1: Demonstrating Embeddings ---



--- Step 2: Running RAG Demo ---




Knowledge Base Documents:
Document 1:
Content:
Functions in Python are blocks of reusable code.
Metadata: {'source': 'python_functions.md'}
--------------------------------------------------
Document 2:
Content:
Error handling in Python uses try-except blocks.
Metadata: {'source': 'error_handling.md'}
--------------------------------------------------
Document 3:
Content:
Good documentation includes function signatures.
Metadata: {'source': 'documentation_guidelines.md'}
--------------------------------------------------

Query: How to document Python functions?

Step 1: Visualizing Document Retrieval...



Query: How to document Python functions?

Retrieved Documents:
Document 1 - Relevance Score: 0.623
Content:
Functions in Python are blocks of reusable code.
--------------------------------------------------
Document 3 - Relevance Score: 0.524
Content:
Good documentation includes function signatures.
--------------------------------------------------

Step 2: Generating Documentation...

Prompt Sent to Claude:
You are a documentation expert. Generate documentation following the google style guide.

        Relevant context from similar documentation:
        Functions in Python are blocks of reusable code.
Good documentation includes function signatures.

        Query to document:
        How to document Python functions?
        

Generated Response:
Here's documentation on how to document Python functions following the Google style guide:

```python
"""How to Document Python Functions

This guide explains the proper way to document Python functions using Google-style
docstrings.

F

KeyboardInterrupt: Interrupted by user