In [6]:
import sys
import os

# Get the virtual environment's Python executable path
python_executable = sys.executable

# Download get-pip.py
!curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py

# Install pip using the virtual environment's Python
!{python_executable} get-pip.py

# Clean up
!rm get-pip.py

# Verify pip installation
!{python_executable} -m pip --version

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2222k  100 2222k    0     0  4923k      0 --:--:-- --:--:-- --:--:-- 4927k
Collecting pip
  Using cached pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Using cached pip-24.3.1-py3-none-any.whl (1.8 MB)
Downloading wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: wheel, pip
Successfully installed pip-24.3.1 wheel-0.45.1
pip 24.3.1 from /home/pkang/ai/aibootcamp/AIE5/02_Embeddings_and_RAG/.venv/lib/python3.11/site-packages/pip (python 3.11)


In [7]:
# Install PyPDF2 with the interpreter running this kernel (sys.executable), so
# the package lands in the environment the imports below are resolved from.
!{sys.executable} -m pip install PyPDF2

# Imports: aimakerspace project helpers (loaders, vector store, prompts, chat
# model) followed by the other modules this notebook uses.
from aimakerspace.text_utils import TextFileLoader, CharacterTextSplitter
from aimakerspace.vectordatabase import VectorDatabase
from aimakerspace.openai_utils.prompts import UserRolePrompt, SystemRolePrompt
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
import asyncio
import PyPDF2
from typing import List
import os

# Enable async in Jupyter.
# NOTE(review): presumably required so the later asyncio.run(...) call works
# inside the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [11]:
# Install required packages with the kernel's interpreter. PyPDF2 is also
# installed by the earlier imports cell; reportlab is the new dependency here.
!{sys.executable} -m pip install PyPDF2 reportlab

# Create a test PDF with some content.
# `canvas` provides the Canvas object used to draw the pages and `letter` is
# the page size passed to it in create_test_pdf.
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

def create_test_pdf(filename: str):
    """Create a two-page test PDF with fixed sample content.

    Args:
        filename: Path of the PDF to write. Any missing parent directory is
            created first, so the function also works on a fresh checkout.

    Side effects:
        Writes the PDF to ``filename`` and prints a confirmation line.
    """
    import os  # local import keeps the function usable on its own

    # One inner list per page; each entry is one line of text on that page.
    pages = [
        [
            "Test PDF Document - Page 1",
            "This is a sample PDF created for testing the RAG system.",
            "The Michael Eisner problem is discussed in this document.",
            "It relates to CEOs who hire weak executives in their former specialty.",
        ],
        [
            "Test PDF Document - Page 2",
            "More information about executive hiring:",
            "1. Always hire strong executives",
            "2. Don't micromanage unnecessarily",
            "3. Focus on their strengths",
        ],
    ]
    left_margin = 100   # x coordinate shared by every line
    first_line_y = 750  # y coordinate of the first line on each page
    line_step = 50      # vertical distance between consecutive lines

    # Without this, saving fails when the target directory does not exist yet.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    c = canvas.Canvas(filename, pagesize=letter)
    for page_index, lines in enumerate(pages):
        if page_index:
            # Finish the previous page before drawing on the next one.
            c.showPage()
        y = first_line_y
        for line in lines:
            c.drawString(left_margin, y, line)
            y -= line_step

    c.save()
    print(f"Created test PDF: {filename}")

# Create the test PDF at data/test.pdf, the path the PDFLoader cell reads later.
create_test_pdf("data/test.pdf")

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.5
Created test PDF: data/test.pdf


In [12]:
class PDFLoader:
    """Loads a PDF file and converts it to a single text document.

    The whole PDF becomes one string in which every page is prefixed with a
    ``[Page N]`` marker (1-based) and terminated by a newline.
    """

    def __init__(self, path: str):
        """Remember which file to load.

        Args:
            path: Path of the PDF file to read.
        """
        self.path = path
        # Result of the most recent load_documents() call.
        self.documents = []

    def load_documents(self) -> List[str]:
        """Load the PDF and convert it to text documents.

        Returns:
            A one-element list holding the text of the whole PDF, or an empty
            list when the file does not exist or cannot be read. Failures are
            reported with a printed message rather than an exception.
        """
        # Start from a clean slate: previously each call appended to the same
        # list, so loading twice returned the document twice.
        self.documents = []

        if not os.path.exists(self.path):
            print(f"File not found: {self.path}")
            return []

        try:
            with open(self.path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                page_count = len(pdf_reader.pages)
                # `or ""` guards against a None result so the literal text
                # "None" can never leak into the document.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        except Exception as e:
            # Deliberate best-effort: report the problem and return nothing.
            print(f"Error loading PDF {self.path}: {str(e)}")
            return []

        text = "".join(
            f"[Page {number}] {page_text}\n"
            for number, page_text in enumerate(page_texts, 1)
        )
        self.documents.append(text)
        print(f"Successfully loaded PDF: {self.path}")
        print(f"Number of pages: {page_count}")
        return self.documents

In [13]:
# Load text documents from the blog text file (the recorded run reports one
# document for this path).
text_loader = TextFileLoader("data/PMarcaBlogs.txt")
text_documents = text_loader.load_documents()
print(f"Loaded {len(text_documents)} text documents")

# Load PDF documents: PDFLoader returns the whole PDF as a single string with
# "[Page N]" markers, or an empty list if the file is missing or unreadable.
pdf_loader = PDFLoader("data/test.pdf")
pdf_documents = pdf_loader.load_documents()
print(f"Loaded {len(pdf_documents)} PDF documents")

# Combine all documents into one list; text documents come first.
documents = text_documents + pdf_documents

# Split documents into chunks using the splitter's default settings.
text_splitter = CharacterTextSplitter()
split_documents = text_splitter.split_texts(documents)
print(f"Created {len(split_documents)} chunks after splitting")

# Create and populate vector database.
# The coroutine is driven to completion with asyncio.run and its return value
# replaces vector_db.
# NOTE(review): presumably abuild_from_list returns the populated database and
# this call relies on nest_asyncio.apply() from the imports cell — confirm.
vector_db = VectorDatabase()
vector_db = asyncio.run(vector_db.abuild_from_list(split_documents))

Loaded 1 text documents
Successfully loaded PDF: data/test.pdf
Number of pages: 2
Loaded 1 PDF documents
Created 374 chunks after splitting


In [14]:
def test_rag_system(query: str, k: int = 3):
    """Test the RAG system with a query."""
    print(f"Query: {query}\n")
    results = vector_db.search_by_text(query, k=k)
    
    print(f"Top {k} relevant chunks:")
    for i, (text, score) in enumerate(results, 1):
        print(f"\n{i}. Relevance Score: {score:.4f}")
        # Check if the chunk is from PDF
        if "[Page" in text:
            print("Source: PDF Document")
        else:
            print("Source: Text Document")
        print(f"Text: {text[:200]}...")
        print("-" * 80)

# Test queries that should match both document types.
# The first two ask about topics that appear in the sample PDF text and, per the
# recorded output, in the blog text; the third asks about the PDF itself.
test_queries = [
    "What is the Michael Eisner problem?",
    "What are the guidelines for hiring executives?",
    "How many pages are in the test PDF?",
]

# Run every query with the default k=3 and print a 100-character "=" rule
# after each report.
for query in test_queries:
    test_rag_system(query)
    print("\n" + "="*100 + "\n")

Query: What is the Michael Eisner problem?

Top 3 relevant chunks:

1. Relevance Score: 0.5650
Source: PDF Document
Text: [Page 1] Test PDF Document - Page 1
This is a sample PDF created for testing the RAG system.
The Michael Eisner problem is discussed in this document.
It relates to CEOs who hire weak executives in th...
--------------------------------------------------------------------------------

2. Relevance Score: 0.5040
Source: Text Document
Text: ordingly.
Seventh, when hiring the executive to run your former specialty, be
careful you don’t hire someone weak on purpose.
This sounds silly, but you wouldn’t believe how oaen it happens.
The CEO w...
--------------------------------------------------------------------------------

3. Relevance Score: 0.4102
Source: Text Document
Text: ed?
In reality — as opposed to Marc’s warped view of reality — it will
be extremely helpful for Marc [if he were actually the CEO,
which he is not] to meet with the new head of engineering daily
w

In [15]:
from aimakerspace.openai_utils.prompts import (
    UserRolePrompt,
    SystemRolePrompt,
    AssistantRolePrompt,
)

from aimakerspace.openai_utils.chatmodel import ChatOpenAI

# Smoke test of the chat model with one system message and one user message.
# chat_openai is reused later when the RAG pipeline is constructed.
chat_openai = ChatOpenAI()
# The user template is a pass-through: the whole message is the {content} value.
user_prompt_template = "{content}"
user_role_prompt = UserRolePrompt(user_prompt_template)
# The system template takes an {expertise} placeholder, filled in below.
system_prompt_template = (
    "You are an expert in {expertise}, you always answer in a kind way."
)
system_role_prompt = SystemRolePrompt(system_prompt_template)

# Messages are passed in order: system message first, then the user message.
messages = [
    system_role_prompt.create_message(expertise="Python"),
    user_role_prompt.create_message(
        content="What is the best way to write a loop?"
    ),
]

# The reply is stored in `response`; this cell does not display it.
response = chat_openai.run(messages)

In [16]:
# System prompt for the RAG pipeline: answer only from the supplied context and
# otherwise reply "I don't know". It has no placeholders.
# The ` \` right after the opening quotes is a line continuation inside the
# string, so the text starts with a single space followed by the first sentence.
RAG_PROMPT_TEMPLATE = """ \
Use the provided context to answer the user's query.

You may not answer the user's query unless there is specific context in the following text.

If you do not know the answer, or cannot answer, please respond with "I don't know".
"""

# Used by RetrievalAugmentedQAPipeline.run_pipeline to build the system message.
rag_prompt = SystemRolePrompt(RAG_PROMPT_TEMPLATE)

# User message template with two placeholders: {context} receives the retrieved
# chunks and {user_query} receives the question.
USER_PROMPT_TEMPLATE = """ \
Context:
{context}

User Query:
{user_query}
"""


# Used by RetrievalAugmentedQAPipeline.run_pipeline to build the user message.
user_prompt = UserRolePrompt(USER_PROMPT_TEMPLATE)

class RetrievalAugmentedQAPipeline:
    """Answer a user query with a chat model, grounded in retrieved chunks.

    The pipeline uses the module-level ``rag_prompt`` (system message, no
    placeholders) and ``user_prompt`` (user message with ``{context}`` and
    ``{user_query}`` placeholders).
    """

    def __init__(self, llm: "ChatOpenAI", vector_db_retriever: "VectorDatabase") -> None:
        """Store the collaborators.

        Args:
            llm: Chat model; only its ``run(messages)`` method is used.
            vector_db_retriever: Retriever; only its
                ``search_by_text(query, k=...)`` method is used.
        """
        # The annotations name the types instead of calling them: the previous
        # ``llm: ChatOpenAI()`` built a chat client while the class was being
        # defined. They are quoted so nothing is evaluated at definition time.
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever

    def run_pipeline(self, user_query: str, k: int = 4) -> dict:
        """Retrieve context for a query and ask the chat model to answer it.

        Args:
            user_query: The question to answer.
            k: Number of chunks to retrieve (defaults to 4).

        Returns:
            A dict with two keys: ``"response"`` holds whatever ``llm.run``
            returns, and ``"context"`` holds the retrieved results exactly as
            returned by the retriever.
        """
        # 1. Retrieve relevant context
        context_list = self.vector_db_retriever.search_by_text(user_query, k=k)

        # 2. Format the context: the first item of every hit is its text, and
        #    each text is followed by a newline.
        context_prompt = "".join(hit[0] + "\n" for hit in context_list)

        # 3. Create prompts
        formatted_system_prompt = rag_prompt.create_message()
        formatted_user_prompt = user_prompt.create_message(
            user_query=user_query,
            context=context_prompt,
        )

        # 4. Return response and context
        return {
            "response": self.llm.run([formatted_system_prompt, formatted_user_prompt]),
            "context": context_list,
        }

In [17]:
# Wire the pipeline to the vector database populated earlier and to the chat
# model created in the chat smoke-test cell.
retrieval_augmented_qa_pipeline = RetrievalAugmentedQAPipeline(
    vector_db_retriever=vector_db,
    llm=chat_openai
)

In [18]:
# Ask one question end to end; the cell output is the returned dict with the
# model's "response" and the retrieved "context" entries.
retrieval_augmented_qa_pipeline.run_pipeline("What is the 'Michael Eisner Memorial Weak Executive Problem'?")

{'response': "The 'Michael Eisner Memorial Weak Executive Problem' refers to the tendency of a CEO or founder to hire weak executives in the area where they themselves excel, in order to maintain control and relevance in that function. This phenomenon occurs when a CEO, who has a strong background in a specific area (like product management, sales, or marketing), hires a less capable individual to lead that same function, enabling the CEO to continue positioning themselves as the key authority. The context illustrates this with the example of Michael Eisner, the former CEO of Disney, who struggled with leading ABC after acquiring it, emphasizing the risks of such hiring practices.",
 'context': [('ordingly.\nSeventh, when hiring the executive to run your former specialty, be\ncareful you don’t hire someone weak on purpose.\nThis sounds silly, but you wouldn’t believe how oaen it happens.\nThe CEO who used to be a product manager who has a weak\nproduct management executive. The CEO who