<a href="https://colab.research.google.com/github/noumantechie/langchain/blob/main/retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1 Loading Document**

In [None]:
!pip install -qU pypdf langchain-text-splitters langchain_huggingface "langchain-chroma>=0.1.2" langchain_google_genai langchain_core


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Path of the PDF to index. NOTE(review): the /content prefix looks like the
# Colab runtime directory — confirm the file has been uploaded there.
pdf_path = "/content/General_Instructions_candiadtes.pdf"

# Create the loader for that file; the document is read in the next cell.
loader = PyPDFLoader(file_path=pdf_path)

In [None]:
# Read the PDF into Document objects. load() is used rather than
# load_and_split(): the latter is documented as deprecated and additionally runs
# a default text splitter, while this notebook does its own chunking in Step 2.
pages = loader.load()

**Step 2 Chunking**

In [None]:
# Split the loaded documents into small pieces: chunk_size=200 with
# chunk_overlap=50, so neighbouring chunks share some text. The size is kept
# deliberately small for demonstration purposes.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# One list of chunk Documents covering every entry in `pages`.
chunks = text_splitter.split_documents(pages)

**Step 3 Generate Embeddings**

In [None]:
# Embedding model used to turn text into vectors (an open-source
# Sentence Transformer checkpoint, loaded through the Hugging Face wrapper).
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

**Step 4 Semantic Search and Storing into Database**

In [None]:
# Embed every chunk and index the resulting vectors in a Chroma collection.
# No persist_directory is passed here, so nothing is configured to be saved.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
)

In [None]:
# Wrap the vector store as a retriever (default settings, since no arguments
# are passed) so it can be used as the first step of the chain below.
retriever = vectorstore.as_retriever()

In [None]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [None]:
from google.colab import userdata

# Read the Gemini API key through Colab's `userdata` helper so the key itself
# never appears in the notebook.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Chat model that generates the final answer.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", api_key=GOOGLE_API_KEY)

In [None]:
# Prompt text for the answer-generation step. {question} and {context} are the
# two placeholders; the chain supplies a value for each. The string is used
# exactly as written, including the leading spaces on each line.
template = """SYSTEM: You are a question-answering bot.
              Be factual in your responses.
              Respond to the following question only using the context provided below:
              Question: {question}
              Context: {context}
              If you don't know the answer, just say that you don't know.

              """


# Build the prompt object from the template string above.
prompt = PromptTemplate.from_template(template)


In [None]:
# Inputs for the prompt: "context" is the retriever's output passed through
# format_docs, and "question" is the chain's input forwarded unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Full pipeline: build prompt inputs -> fill the prompt -> call the model ->
# parse the model output into a plain string.
chain = rag_inputs | prompt | llm | StrOutputParser()


In [35]:
# Run the pipeline on a sample question; the returned value is displayed as the
# cell's output.
question = "what is Testing & Interview Process ?"
chain.invoke(question)

'Based on the provided text, the testing and interview process involves a test or screening test, followed by an interview for eligible candidates.  Eligible candidates will be contacted and must bring original documents to their interview.'