In [1]:
# needed to add this to suppress a warning during the embedding process below. Not part of the tutorial really...
from tqdm.auto import tqdm

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## First change - Multiple PDFs

Leveling up from our basic single-file tutorial, we'll now attempt to handle multiple PDFs.

1. Let's create a function for reading PDFs (we know how to do this from our previous notebook)
2. Let's create a function to process a folder, retreiving the contents of pdfs in that folder

In [3]:
from typing import List, Dict
from pypdf import PdfReader

# Define a function to extract text from PDF files
def read_pdf(file_path: str) -> str:
    reader = PdfReader(file_path)
    pdf_text = ""
    for page in reader.pages:
        pdf_text += page.extract_text()
    return pdf_text

# Define a function to process a folder of pdfs
# The return is a dictionary where the keys are the file names and the values are the textual content
def process_pdf_folder(folder_path: str) -> Dict[str, str]:
    print("Processing PDFs in folder:", folder_path)
    pdf_contents = {}
    for filename in os.listdir(folder_path):
        print("Processing file:", filename)
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            pdf_contents[filename] = read_pdf(file_path) # Use our pdf reader here
    return pdf_contents



In [4]:
# Set up our folder path
FOLDER_PATH = r"/Users/michaeldownes/Downloads/muitiple-pdfs-rag-demo"

pdf_dictionary = process_pdf_folder(FOLDER_PATH)

# Let's take a peek at the dictory
# For each key-value pair in the dictionary, print the key and the length of the value
for key, value in pdf_dictionary.items():
    print(key, len(value))

Processing PDFs in folder: /Users/michaeldownes/Downloads/muitiple-pdfs-rag-demo
Processing file: equities.pdf
Processing file: labeling.pdf
Processing file: pro worker ai policy.pdf
Processing file: llms.pdf
Processing file: climate change paper.pdf
equities.pdf 35461
labeling.pdf 38714
pro worker ai policy.pdf 32328
llms.pdf 19956
climate change paper.pdf 22363


# Time to chunk again (simple)

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def chunk_documents(pdf_dictionary: Dict[str, str]) -> List[Dict[str, str]]:
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) # hardcoding this for now
    chunks = []
    for filename, content in pdf_dictionary.items(): # Pass in our dictionary of pdfs to content
        print(f"Chunking {filename}")
        document_chunks = text_splitter.split_text(content) # Split the pdf content
        print(f"Number of chunks: {len(document_chunks)}")
        chunks.extend([{"source": filename, "content": chunk} for chunk in document_chunks]) # Add the chunks to our list

    print(f"Total number of chunks across all documents: {len(chunks)}")
    return chunks

In [6]:
# lets try a chunking process to view some print statements
chunks = chunk_documents(pdf_dictionary)

Chunking equities.pdf
Number of chunks: 79
Chunking labeling.pdf
Number of chunks: 92
Chunking pro worker ai policy.pdf
Number of chunks: 73
Chunking llms.pdf
Number of chunks: 45
Chunking climate change paper.pdf
Number of chunks: 50
Total number of chunks across all documents: 339
equities.pdf 471
equities.pdf 467
equities.pdf 488


# Embed and store

In [7]:
import torch
from sentence_transformers import SentenceTransformer

def embed_chunks(chunks: List[Dict[str, str]]) -> List[Dict[str, str]]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "BAAI/bge-small-en-v1.5" # Hardcoding a model for now
    embedding_model = SentenceTransformer(model_name, device=device)
    
    texts = [chunk["content"] for chunk in chunks]
    embeddings = embedding_model.encode(texts, show_progress_bar=True)
    return embeddings

In [8]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

def store_in_db(chunks: List[Dict[str, str]], embeddings: torch.Tensor):
    client = QdrantClient("http://localhost:6333")
    collection_name="qa_index" # using the same index from the basic example on purpose here

    # Delete collection if it already exists
    client.delete_collection(collection_name)

    # Create collection
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embeddings.shape[1], distance=Distance.COSINE),
    )

    # Prepare payload and IDs
    payload = [{"content": chunk["content"], "source": chunk["source"]} for chunk in chunks]
    ids = list(range(len(chunks)))

    # Upload to Qdrant
    client.upload_collection(
        collection_name=collection_name,
        vectors=embeddings,
        payload=payload,
        ids=ids,
        batch_size=256
    )

    return client

In [9]:
embeddings = embed_chunks(chunks)
print(f"Created embeddsings of shape: {embeddings.shape}")

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Created embeddsings of shape: (339, 384)


In [10]:
client = store_in_db(chunks, embeddings)
print(f"Stored {client.count(collection_name='qa_index')} vectors in the collection")

Stored count=339 vectors in the collection


# Recap
- Folder files read
- Folder files added to dictionary (filename: content)
- Chunk list created (list of objects with keys `source` and `content`)
- Embeddings created (vectorize each chunk.content)
- Store embeddings with payloads

In [11]:
def search(text: str, top_k: int):
    # Because of scope differences, I copied the embedding_modle code here from the embed_chunks function
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "BAAI/bge-small-en-v1.5" # Hardcoding a model for now
    embedding_model = SentenceTransformer(model_name, device=device)

    # Embed the query text
    query_embedding = embedding_model.encode(text).tolist() 

    search_result = client.search(
        collection_name='qa_index',
        query_vector=query_embedding,
        query_filter=None,
        limit=top_k
    )
    return search_result

In [15]:
from litellm import completion

# Retrieve

# Let's start with an easier question
question = "What has Astrazeneca done to reduce the effects of climate change?"
results = search(question, top_k=5)


# Generate
system_prompt = """You are an assistant for question-answering tasks. Answer the question according only to the given context.
If question cannot be answered using the context, simply say I don't know. Do not make stuff up.

Context: {context}
"""

user_prompt = """
Question: {question}

Answer:"""

# Our results in the last step include relevance scores, versions, payloads, and other stuff
# We'll just send the payload content (the human-readable chunk) to the LLM
references = [obj.payload["content"] for obj in results]

# I think this is just to separate each reference with some new lines to let the LLM know where one chunk starts & ends
context = "\n\n".join(references) 

response = completion(
  api_key=OPENAI_API_KEY,
  model="gpt-3.5-turbo",
  messages=[{"content": system_prompt.format(context=context),"role": "system"}, {"content": user_prompt.format(question=question),"role": "user"}]
)

print(f"ANSWER: {response.choices[0].message.content}\n\n")
print(f"REFERENCES:\n")
for index, ref in enumerate(references):
    print(f"Reference: [{index + 1}]: {ref}\n")

ANSWER: AstraZeneca installed three highly efficient heat pumps at their site in Gothenburg, Sweden, which has allowed them to drastically reduce the use of gas and electrify some of the site's steam demand. This initiative has the potential to replace over 60% of the site's gas consumption, contributing to greenhouse gas reduction efforts.


REFERENCES:

Reference: [1]: projects and continuing to work with key supply 
chain partners to help them achieve greenhouse gas 
reductions.ASTRAZENECA
As part of AstraZeneca’s GHG reduction program 
(validated for the first time by SBTi in 2016), Astra 
Zeneca’s site in Gothenburg, Sweden has installed 
three highly efficient heat pumps, which has 
allowed it to drastically reduce the use of gas. It 
electrifies some of the site’s steam demand and 
has the potential to replace over 60% of site gas 
consumption.

Reference: [2]: o AbbVie 2018 Responsible Action Report  and AbbVie - Prioritizing environmental sustainability
o AstraZeneca’s Ambitio

In [18]:
def ragOpenAI(question: str):
    results = search(question, top_k=5)
    references = [obj.payload["content"] for obj in results]
    
    system_prompt = """You are an assistant for question-answering tasks. Answer the question according only to the given context.
    If question cannot be answered using the context, simply say I don't know. Do not make stuff up.

    Context: {context}
    """

    user_prompt = """
    Question: {question}

    Answer:"""

    # Our results in the last step include relevance scores, versions, payloads, and other stuff
    # We'll just send the payload content (the human-readable chunk) to the LLM
    references = [obj.payload["content"] for obj in results]

    # I think this is just to separate each reference with some new lines to let the LLM know where one chunk starts & ends
    context = "\n\n".join(references) 

    response = completion(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo", messages=[{"content": system_prompt.format(context=context),"role": "system"}, {"content": user_prompt.format(question=question),"role": "user"}])
    
    print(f"ANSWER: {response.choices[0].message.content}\n\n")
    print(f"REFERENCES:\n")
    for index, ref in enumerate(references):
        print(f"Reference: [{index + 1}]: {ref}\n")

In [19]:
# Let's try a more difficult question
question="How might we regulate LLMs?"
response = ragOpenAI(question)

ANSWER: There are at least two models for how regulation of LLMs might be carried out. One model involves a progressive tightening of standards, allowing "unsafe" models to be deployed with notifications to users, while the other model focuses on disclosing when LLMs are used in regulated domains and requiring the storage of model weights and parameters for investigations. Additionally, regulation should require risk assessment by model providers before release and could be done through existing domain-specific regulations.


REFERENCES:

Reference: [1]: with relatively minimal retraining of the model. Regulation should require developers to assess potential risks prior to deployment and establish liability for developers that distribute models that are used to cause foreseeable harm.  Part 3: Innovations that Could Improve Large Language Models  LLM technology is still in a state of flux. There are a number of potential technical innovations that could help mitigate the safety concern