<a href="https://colab.research.google.com/github/noumantechie/langchain/blob/main/retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Webbase RAG APPLICATION**

In [37]:
!pip install -qU beautifulsoup4 langchain-text-splitters langchain_huggingface "langchain-chroma>=0.1.2" langchain_google_genai langchain_core
!pip install



In [38]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser



In [39]:
# Load the PDF file
URL = "https://www.euclea-b-school.com/deepseek-ai-vs-open-ai-a-comprehensive-comparison/"
loader = WebBaseLoader(URL)


In [40]:
# Now use load_and_split()
pages = loader.load_and_split()

**Step 2 Chunking**

In [41]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=200,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(pages)

**Step 3 Generate Embeddings **

In [42]:
# Load an open-source Sentence Transformer model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

**Step 4 Semantic Search and Storing into Database**

In [43]:
vectorstore = Chroma.from_documents(documents=chunks , embedding=embeddings)

In [44]:
retriever = vectorstore.as_retriever()

In [45]:
def format_docs(docs):
  return "\n\n".join(doc.page_content for doc in docs)

In [46]:
from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    api_key=GOOGLE_API_KEY  # Corrected 'api_ket' to 'api_key'
)

In [47]:
template = """SYSTEM: You are a question-answering bot.
              Be factual in your responses.
              Respond to the following question only using the context provided below:
              Question: {question}
              Context: {context}
              If you don't know the answer, just say that you don't know.

              """


prompt = PromptTemplate.from_template(template)


In [48]:
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [52]:
chain.invoke("Tell me the Strengths and Weaknesses?")

'OpenAI Strengths: Advanced NLP capabilities, versatile and widely applicable, strong community and ecosystem, extensive documentation and support.  OpenAI Weaknesses:  The provided text does not list any weaknesses beyond the heading "OpenAI Weaknesses:".'