RAG From: OpenAI Cookbook

In [None]:
!pip install pypdf # For loading PDF documents
!pip install langchain # A framework for building LLM applications
!pip install openai # For OpenAI embeddings and LLMs (if using)
!pip install chromadb # A simple, in-memory vector store (good for beginners)
!pip install tiktoken # For token counting (useful for chunking)

Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-5.8.0-py3-none-any.whl (309 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.7/309.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.8.0
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.35.0-

In [57]:
from google.colab import files

uploaded = files.upload()
for filename in uploaded.keys():
    print(f'User uploaded file "{filename}" with length {len(uploaded[filename])} bytes')

# Assuming you upload a file named 'your_document.pdf'
pdf_path = 'oai-australia.pdf'

Saving oai-australia.pdf to oai-australia.pdf
User uploaded file "oai-australia.pdf" with length 410238 bytes


In [None]:
!pip install -U langchain-community -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [58]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, # Max size of each chunk
    chunk_overlap=200 # Overlap between chunks to maintain context
)
chunks = text_splitter.split_documents(documents)

print(f"Number of chunks created: {len(chunks)}")
print(f"Example chunk content: {chunks[0].page_content[:200]}...") # Display first 200 chars of first chunk
print(f"Example chunk metadata: {chunks[0].metadata}") # Check metadata like page number

Number of chunks created: 57
Example chunk content: AI in Australia
OpenAI’s Economic Blueprint
July 2025...
Example chunk metadata: {'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Macintosh)', 'creationdate': '2025-06-25T12:07:27+08:00', 'moddate': '2025-06-25T12:07:28+08:00', 'trapped': '/False', 'source': 'rag_data/oai-australia.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}


In [None]:
!pip install -U langchain-openai -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from openai import OpenAI # Ensure this is imported for client configuration
from google.colab import userdata

# Initialize the OpenAI client pointing to OpenRouter
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=userdata.get("OPENROUTER_API_KEY"),
)
from langchain_openai import OpenAIEmbeddings # Using langchain_openai for newer integrations


os.environ["OPENAI_API_BASE"] = "https://openrouter.ai/api/v1"
os.environ["OPENAI_API_KEY"] = userdata.get("OPENROUTER_API_KEY")

from langchain_openai import OpenAIEmbeddings

# Changed model name to a more current one. Verify this model is available on OpenRouter.ai

embeddings = OpenAIEmbeddings(model="intfloat/multilingual-e5-large")

print("Embedding model configured: intfloat/multilingual-e5-large via OpenRouter.")

Embedding model configured: intfloat/multilingual-e5-large via OpenRouter.


In [None]:
!pip install transformers sentence-transformers # Install necessary libraries for Hugging Face models

from langchain.embeddings import HuggingFaceEmbeddings

# Choose a suitable embedding model from Hugging Face.
# 'sentence-transformers/all-MiniLM-L6-v2' is a good, lightweight, and fast option.
# 'intfloat/multilingual-e5-large' is also good but larger.
model_name = "sentence-transformers/all-MiniLM-L6-v2" # or "intfloat/multilingual-e5-large"

embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"Embedding model configured: {model_name} (local Hugging Face model).")

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

  embeddings = HuggingFaceEmbeddings(model_name=model_name)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model configured: sentence-transformers/all-MiniLM-L6-v2 (local Hugging Face model).


In [None]:
from langchain.vectorstores import Chroma

# Create a ChromaDB vector store from the document chunks and embeddings
vector_store = Chroma.from_documents(
    chunks, # 'chunks' should be available from your step 3 completion
    embeddings
)

print("PDF chunks successfully embedded and stored in ChromaDB.")

PDF chunks successfully embedded and stored in ChromaDB.


In [51]:
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

In [59]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=userdata.get("OPENROUTER_API_KEY"),
    model_name="deepseek/deepseek-chat-v3-0324:free",
    temperature=0
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

query = "What is written about 'government' in this document and on which page?"
response = qa_chain.invoke({"query": query})

print(f"Query: {query}")
print(f"Answer: {response['result']}")
print("\nSource documents:")
for doc in response["source_documents"]:
    print(f"- Page {doc.metadata.get('page')}: {doc.page_content[:150]}...")

Query: What is written about 'government' in this document and on which page?
Answer: The document discusses the role of government in several contexts, particularly in relation to AI adoption and public service improvements. Here are the key points mentioned:

1. **AI in Public Services**:  
   - The document highlights how AI can improve public services, using the example of Minnesota’s Enterprise Translation Office leveraging ChatGPT Enterprise to enhance multilingual service delivery (specific page number not provided in the excerpt).  

2. **Policy Recommendations for Government**:  
   - **Digitising and releasing public datasets** in accessible formats (e.g., housing, energy, education).  
   - Implementing the **Data Governance Framework** under the Data Availability and Transparency Act.  
   - Reviewing **data classification rules** to enable AI applications in the public sector.  
   - Investing in **whole-of-government AI training** (APS uplift).  
   - Establishing a **pub

In [60]:
import os

# Define your data directory
data_dir = 'rag_data'

# Create the directory if it doesn't exist
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"Directory '{data_dir}' created.")
else:
    print(f"Directory '{data_dir}' already exists.")

# Example: Assuming you've uploaded some .txt and .pdf files into 'rag_data'
# E.g., rag_data/document1.pdf, rag_data/report.txt, rag_data/article.pdf, etc.

Directory 'rag_data' already exists.


In [61]:
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the directory containing your PDF and text files
data_dir = 'rag_data' # Make sure this matches the directory created in step 1

# Create a dictionary mapping file extensions to their respective loaders
loader_mapping = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    # Add other loaders if you have other file types, e.g.,
    # ".docx": Docx2txtLoader,
    # ".csv": CSVLoader,
}

documents = []
print(f"Loading documents from '{data_dir}'...")

# Iterate through files in the directory and load them using the appropriate loader
for root, _, files in os.walk(data_dir):
    for file in files:
        file_path = os.path.join(root, file)
        _, file_extension = os.path.splitext(file)
        file_extension = file_extension.lower()

        # Check if the file extension is in the loader_mapping
        if file_extension in loader_mapping:
            loader_class = loader_mapping[file_extension]
            try:
                loader = loader_class(file_path)
                documents.extend(loader.load())
                print(f"Loaded {file_path}")
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
        else:
            print(f"Skipping unsupported file type: {file_path}")

print(f"Loaded {len(documents)} document pages/sections in total.")

# Check if any documents were loaded
if not documents:
    print("No documents were loaded. Please make sure there are files with specified extensions in the directory.")
else:
    # --- Split Documents into Chunks (Same as before, but applied to all loaded docs) ---
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    chunks = text_splitter.split_documents(documents)

    print(f"Created {len(chunks)} chunks from all documents.")
    # Access metadata safely, checking if chunks list is not empty
    if chunks:
        print(f"Example of a chunk's metadata (showing source and page for PDFs/TXTs): {chunks[0].metadata}")
    else:
        print("No chunks were created.")

Loading documents from 'rag_data'...
Loaded rag_data/normal.txt
Loaded rag_data/linkedin-unwrapped.pdf
Loaded 13 document pages/sections in total.
Created 1 chunks from all documents.
Example of a chunk's metadata (showing source and page for PDFs/TXTs): {'source': 'rag_data/normal.txt'}


In [64]:
# --- Test Queries ---

# Query 1: Information likely from a PDF file about LinkedIn (e.g., from a formal report)
query1 = "What are the key findings regarding text mentioned in the text file?"
response1 = qa_chain.invoke({"query": query1})
print(f"\n--- Query 1 ---")
print(f"Question: {query1}")
print(f"Answer: {response1['result']}")
print("\nSource documents:")
for i, doc in enumerate(response1["source_documents"]):
    source_info = doc.metadata.get('source', 'Unknown Source')
    page_info = doc.metadata.get('page', 'N/A')
    print(f"- Doc {i+1} (Source: {source_info}, Page: {page_info}): {doc.page_content[:200]}...")

# Query 2: Information likely from a text file (e.g., a simple article or notes)
query2 = "Can you summarize the material presented about random topic?"
response2 = qa_chain.invoke({"query": query2})
print(f"\n--- Query 2 ---")
print(f"Question: {query2}")
print(f"Answer: {response2['result']}")
print("\nSource documents:")
for i, doc in enumerate(response2["source_documents"]):
    source_info = doc.metadata.get('source', 'Unknown Source')
    page_info = doc.metadata.get('page', 'N/A')
    print(f"- Doc {i+1} (Source: {source_info}, Page: {page_info}): {doc.page_content[:200]}...")


--- Query 1 ---
Question: What are the key findings regarding text mentioned in the text file?
Answer: The key findings regarding the text in the document include:

1. **AI in Public Services**:  
   - AI, particularly OpenAI's tools like ChatGPT Enterprise, is being used to improve multilingual service delivery in government (e.g., Minnesota’s Enterprise Translation Office). This reduces turnaround times and enhances access for non-English-speaking communities while maintaining human oversight for accuracy.  

2. **Productivity Gains**:  
   - AI adoption has led to significant productivity improvements:  
     - 15% increase in worker productivity for customer-support agents.  
     - 40% of nursing tasks can be automated, freeing up time for patient care.  
     - Average reduction in time spent on writing tasks with an 18% increase in quality.  

3. **Healthcare Applications**:  
   - AI can automate administrative tasks (e.g., clinical note-writing) and support clinical decision-