# RAG - Q&A Application

## Import the API key

In [2]:
import os
from dotenv import load_dotenv, find_dotenv 

load_dotenv(find_dotenv(), override=True)

API_OPENAI = os.getenv("API_OAI_KEY")

## Loadint PDF

### Creating a loading function

In [3]:
def load_document(file):
    import os 
    _, extension = os.path.splitext(file)
    if extension == ".pdf":
        from langchain.document_loaders import PyPDFLoader # import the `pypdf` module to make it work
        print(f"Loading {file}...")
        loader = PyPDFLoader(file)
    elif extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader # import the `docx2txt` module to make it work
        print(f"Loading {file}...")
        loader = Docx2txtLoader(file)
    data = loader.load()
    print("Done ... ")
    return data

### Running the code

In [4]:
data = load_document(file="us_constitution.pdf")
print(f"You have {len(data)} pages in you data")

Loading us_constitution.pdf...
Done ... 
You have 41 pages in you data


In [5]:
data = load_document(file="the_great_gatsby.docx")
print(f"You have {len(data)} pages in you data")

Loading the_great_gatsby.docx...
Done ... 
You have 1 pages in you data


## External source, e.g., Wikipedia

In [6]:
# pip install wikipedia -q
def load_from_wikipedia(query, lang="en", load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data

In [7]:
data = load_from_wikipedia("GPT4")
print(data[0].page_content)

Generative Pre-trained Transformer 4 (GPT-4) is a multimodal large language model trained and created by OpenAI and the fourth in its series of GPT foundation models. It was launched on March 14, 2023, and made publicly available via the paid chatbot product ChatGPT Plus, via OpenAI's API, and via the free chatbot Microsoft Copilot.  As a transformer-based model, GPT-4 uses a paradigm where pre-training using both public data and "data licensed from third-party providers" is used to predict the next token. After this step, the model was then fine-tuned with reinforcement learning feedback from humans and AI for human alignment and policy compliance.: 2 
Observers reported that the iteration of ChatGPT using GPT-4 was an improvement on the previous iteration based on GPT-3.5, with the caveat that GPT-4 retains some of the problems with earlier revisions. GPT-4, equipped with vision capabilities (GPT-4V), is capable of taking images as input on ChatGPT. OpenAI has not revealed technical 

## Chunking the text

In [8]:
def chunk_data(data, chunk_size = 256):
	# for generic text, the one below is the recommended text splitter
	# by default: \n \\n and white space
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
	chunks = text_splitter.split_documents(data)
	return chunks

In [9]:
data = load_document(file="us_constitution.pdf")
print(f"You have {len(data)} pages in you data")

Loading us_constitution.pdf...
Done ... 
You have 41 pages in you data


In [10]:
chunks = chunk_data(data=data)
print(len(chunks))

224


## Embedding and Uploading to a Vector Database (Pinecone)

In [11]:
API_PINECONE = os.getenv("PINECONE_API_KEY")

In [12]:
# Create an index if does not exists, embedd the chunks and insert into the index. If exists, we load from the index.
def insert_or_fetch_embeddings(index_name, chunks):
	import pinecone
	from langchain_community.vectorstores import Pinecone
	from langchain_openai import OpenAIEmbeddings
	from pinecone import ServerlessSpec
	
	pc = pinecone.Pinecone(api_key=API_PINECONE)
	embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=1536, api_key = API_OPENAI)
	
	if index_name in pc.list_indexes().names():
		print(f"{index_name} already exists. Loading embeddings ... ", end="")
		vector_stores = Pinecone.from_existing_index(index_name, embeddings)
		print("ok")
	else:
		print(f"Creating index {index_name} and embeddings ... ", end="")
		pc.create_index(
			name = index_name,
			dimension=1536,
			metric = "cosine",
			spec= ServerlessSpec(
	        cloud="aws",
	        region="us-east-1"
	    	) 
		)
		vector_stores = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
		
		print("ok")
		return vector_stores

In [13]:
# Delete index in the free tier of Pinecone (only one available)
def delete_pinecone_index(index_name = "all"):
	import pinecone
	pc = pinecone.Pinecone(api_key = API_PINECONE)
	if index_name == "all":
		indexes = pc.list_indexes().names()
		print("Deleting all indexes....")
		for index in indexes:
			pc.delete_index(index)
		print("Ok")
	else:
		print(f"Deleting index {index_name}.....", end="")
		pc.delete_index(index_name)
		print("Ok")

In [14]:
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes....
Ok


In [15]:
vector_store = insert_or_fetch_embeddings(index_name="ask-a-document", chunks=chunks)

Creating index ask-a-document and embeddings ... ok


## Asking and Getting Answers

In [16]:
def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI
    
    llm = ChatOpenAI(api_key=API_OPENAI, model="gpt-4", temperature=1)
    
    # Use the as_retriever() method to get the retriever from the vector store
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":3})
    
    # Create the RetrievalQA chain
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    # Get the answer
    answer = chain.invoke(q)
    return answer




In [17]:
q = "What is the whole document about ?"
answer = ask_and_get_answer(vector_store, q)


In [18]:
answer["result"]

"I'm sorry, but there is no document provided for me to analyze. Could you please provide the document or information you're referring to?"

In [19]:
import time
i = 1
print("Write Quit or Exit to quit.")

while True:
	q = input(f"Question #{i}")
	i += 1
	if q.lower() in ["quit","exit"]:
		print("Quitting ... bye bye ")
		time.sleep(2)
		break
	answer = ask_and_get_answer(vector_store, q)
	print(f"\nAnswer: {answer["result"]}")
	print(f"\n {"-"*50}\n")

Write Quit or Exit to quit.

Answer: I'm sorry, but I can't assist you unless you provide a question or any context. How can I assist you today?

 --------------------------------------------------


Answer: The document is the United States Constitution. It outlines the principles, structures, and processes of the U.S. government, including the purposes of forming a union, establishing justice, ensuring domestic tranquility, providing for common defense, and promoting general welfare. It also mentions that the Constitution, U.S. laws, and treaties are the supreme law of the country.

 --------------------------------------------------

Quitting ... bye bye 
