# RAG

## Extract text from the PDF

In [2]:
import PyPDF2

def extract_text_from_pdf(pdf_path, max_pages=10):
    """Return the concatenated text of the leading pages of a PDF.

    Args:
        pdf_path: Value handed unchanged to ``PyPDF2.PdfReader``.
        max_pages: Upper bound on the number of pages read, counted from the
            first page. Defaults to 10, the limit this notebook has always
            used; pass ``None`` to read every page.

    Returns:
        str: The page texts joined in page order with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    page_count = len(pdf_reader.pages)
    # Clamp to the real page count so PDFs shorter than the limit no longer
    # raise IndexError.
    limit = page_count if max_pages is None else min(max_pages, page_count)
    # `or ""` keeps the join safe if a page's extraction yields None.
    return "".join(
        (pdf_reader.pages[page_num].extract_text() or "")
        for page_num in range(limit)
    )

# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# on a machine where this exact file exists. The same literal is assigned
# again to `pdf_path_book` in a later cell.
pdf_path = r'C:\Users\HP\OneDrive\14Code\GPT\docs\app\ConceptsofBiology-WEB.pdf'
# Raw text extracted from the PDF; this is the input to the chunking step.
document_text = extract_text_from_pdf(pdf_path)


## Chunk the text

In [3]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Args:
        text: The string to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size`` so the window always
            advances.

    Returns:
        list[str]: Chunks in document order, each a single-space join of its
        words. Empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the last word; any further window would
        # contain only words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Word-window chunks of the extracted text, using chunk_text's default sizes.
chunks = chunk_text(document_text)


## Configuration: PDF path

In [4]:
# Location of the source PDF; read by the cells further down.
# NOTE(review): hard-coded absolute local path, identical to the `pdf_path`
# literal in the first cell -- consider a single configurable location.
pdf_path_book = r'C:\Users\HP\OneDrive\14Code\GPT\docs\app\ConceptsofBiology-WEB.pdf'

## Error Code

In [None]:
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
from langchain.llms import OpenAI
from langchain.chains import VectorDBQA
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# # Load OpenAI API key from environment variable
# openai_api_key = os.getenv('OPENAI_API_KEY')
# if not openai_api_key:
#     raise ValueError("Please set the OPENAI_API_KEY environment variable.")

def extract_text_from_pdf(pdf_path, max_pages=10):
    """Return the concatenated text of the leading pages of a PDF.

    Note: this re-defines the function of the same name from the first cell;
    whichever definition ran last is the one the kernel uses.

    Args:
        pdf_path: Value handed unchanged to ``PyPDF2.PdfReader``.
        max_pages: Upper bound on the number of pages read, counted from the
            first page. Defaults to 10, the limit this notebook has always
            used; pass ``None`` to read every page.

    Returns:
        str: The page texts joined in page order with no separator.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    page_count = len(pdf_reader.pages)
    # Clamp to the real page count so PDFs shorter than the limit no longer
    # raise IndexError.
    limit = page_count if max_pages is None else min(max_pages, page_count)
    # `or ""` keeps the join safe if a page's extraction yields None.
    return "".join(
        (pdf_reader.pages[page_num].extract_text() or "")
        for page_num in range(limit)
    )

def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Note: this re-defines the function of the same name from an earlier cell;
    whichever definition ran last is the one the kernel uses.

    Args:
        text: The string to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size`` so the window always
            advances.

    Returns:
        list[str]: Chunks in document order, each a single-space join of its
        words. Empty or whitespace-only text gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the last word; any further window would
        # contain only words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

# Extract and chunk text
pdf_path = pdf_path_book
document_text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(document_text)

# Embed the chunks using Sentence-Transformers
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(chunks)

# Build the LangChain FAISS store from the precomputed vectors.
# FAISS.from_embeddings expects (text, vector) pairs plus an Embeddings object
# that is used to embed incoming queries, so the query embedder must be the
# same model that produced `embeddings`. The store creates its own index, so
# no separate hand-built faiss index is needed here.
from langchain_community.embeddings import HuggingFaceEmbeddings

query_embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vector_store = FAISS.from_embeddings(
    text_embeddings=list(zip(chunks, embeddings.tolist())),
    embedding=query_embedder,
)

# Set up the LangChain OpenAI LLM.
# The key and model name come from the environment so that no secret is stored
# in the notebook and the cell works on a fresh kernel.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT = os.getenv("OPENAI_DEPLOYMENT")
if not OPENAI_API_KEY or not OPENAI_DEPLOYMENT:
    raise ValueError(
        "Please set the OPENAI_API_KEY and OPENAI_DEPLOYMENT environment variables."
    )
llm = OpenAI(model_name=OPENAI_DEPLOYMENT, openai_api_key=OPENAI_API_KEY)

# Create a LangChain QA chain.
# from_chain_type builds the combine-documents chain that the bare VectorDBQA
# constructor requires but was not given.
qa_chain = VectorDBQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    vectorstore=vector_store,
)

# Function to answer queries
def answer_query(query):
    """Run ``query`` through the notebook-level ``qa_chain`` and return its answer.

    Args:
        query: The question, passed to the chain under the ``"query"`` key.

    Returns:
        The value the chain stores under ``"result"``.
    """
    response = qa_chain({"query": query})
    # VectorDBQA / RetrievalQA chains publish the generated text under their
    # default output key "result"; they produce no "answer" key.
    return response["result"]

# Example query: send one question through answer_query() and print what it
# returns.
query = "What is the main topic of the document?"
answer = answer_query(query)
print(answer)


## LangChain RAG

In [38]:
# PDF Loaders. If unstructured gives you a hard time, try PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

from langchain.vectorstores import Chroma, Pinecone
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
import pinecone

# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain

In [None]:

# Create the PyPDFLoader for the PDF at `pdf_path_book` (assigned in an
# earlier cell); the next cell calls .load() on it.
loader = PyPDFLoader(pdf_path_book)


In [None]:
# Index of the page used for the preview below; clamped to the last page so
# PDFs with fewer pages do not raise IndexError.
SAMPLE_PAGE_INDEX = 40

data = loader.load()
# Note: If you're using PyPDFLoader then it will split by page for you already
print (f'You have {len(data)} document(s) in your data')
if data:
    # Report the length and the excerpt for the same page, so the two lines
    # describe one sample document.
    sample_doc = data[min(SAMPLE_PAGE_INDEX, len(data) - 1)]
    print (f'There are {len(sample_doc.page_content)} characters in your sample document')
    print (f'Here is a sample: {sample_doc.page_content[400:1000]}')
# Next step: split the pages into chunks of around 500 characters each with a
# 50 character overlap. These are relatively small.


In [None]:

# Splitter configured for chunks of size 500 with an overlap of 50 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# Split every loaded page; `texts` is what the vector stores below are built from.
texts = text_splitter.split_documents(data)
# Let's see how many small chunks we have
print (f'Now you have {len(texts)} documents')


In [None]:

# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# # load it into Chroma
# vectorstore = Chroma.from_documents(texts, embeddings)

In [None]:

from langchain.vectorstores import Chroma, Pinecone
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
import pinecone
# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# # load it into Chroma
# vectorstore = Chroma.from_documents(texts, embeddings)

In [54]:
# Embedding wrapper around the MiniLM sentence-transformers model; reused by
# the FAISS store built in a later cell.
embedder = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)
# Build a Chroma vector store from the split documents. No persist directory
# is passed (that argument is commented out below).
vectorstore = Chroma.from_documents(
    documents=texts, 
    embedding=embedder
    # persist_directory=CHROMA_PATH
)
# NOTE(review): leftover line -- no variable named `db` exists in this notebook.
# db.persist()

  warn_deprecated(


In [67]:
# Look up the chunks most similar to the question, using the store's default
# search settings.
query = """how conflicting motivators function for a one-time behavior?"""
docs = vectorstore.similarity_search(query)

In [None]:
# Print the content of every document returned by the similarity search,
# with a blank line after each one.
for doc in docs:
    print (f"{doc.page_content}\n")

In [86]:
# Same question and search as the earlier query cell, run again; `docs` is
# overwritten with the new results.
query = "how conflicting motivators function for a one-time behavior?"
docs = vectorstore.similarity_search(query)

In [93]:
from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# FAISS is otherwise imported only inside an earlier experimental cell, so
# import it here to keep this cell runnable on a fresh kernel.
from langchain.vectorstores import FAISS

# Rebuild the vector store on FAISS from the same split documents and
# embedder; this replaces the Chroma store previously bound to `vectorstore`.
vectorstore=FAISS.from_documents(texts,embedding=embedder)


In [94]:

# Retrieve with maximal marginal relevance: choose 20 chunks out of the 100
# nearest candidates. MMR is selected through `search_type`; ad-hoc
# search_kwargs such as "maximal_marginal_relevance" or "distance_metric" are
# not options the FAISS retriever acts on, so they are not set here.
# NOTE(review): if cosine distance is wanted, it presumably has to be
# configured when the FAISS store is built -- confirm.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 20, "fetch_k": 100},
)

# NOTE(review): every argument below is an empty placeholder and must be
# filled in before this cell can reach a deployment; prefer loading the values
# from the environment over pasting secrets into the notebook.
model =AzureChatOpenAI(azure_deployment="",
    openai_api_version="",
    model="",
    openai_api_key="",
    azure_endpoint="")
# Conversational QA chain: the chat model answers from the retrieved chunks.
qa=ConversationalRetrievalChain.from_llm(model,retriever=retriever)

In [95]:
# Ask one question through the conversational chain.
question="how conflicting motivators function for a one-time behavior?"
# Empty history: this is the first turn, so there are no earlier exchanges.
chat_history=[]
result=qa({"question":question,"chat_history":chat_history})
# Prints the whole result dict (question, chat_history and answer), not just
# the answer text.
print(result)

{'question': 'how conflicting motivators function for a one-time behavior?', 'chat_history': [], 'answer': 'Conflicting motivators can drive a person towards different behaviors even for a one-time behavior. For example, at a company party, a person may have conflicting motivations to take a rest and enjoy their newly scrubbed house, but they may also want to tackle the backyard and cross that project off their list. These competing motivations can push a person towards different behaviors.'}
