# Learning rag from scratch

### basic implementation

In [1]:
import bs4
import os
from langchain_community.document_loaders import WebBaseLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
import getpass
import os

if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [3]:
# Load Documents
# loader = WebBaseLoader(
#     web_paths=["https://www.startupindia.gov.in/content/sih/en/women_entrepreneurs.html","https://wep.gov.in/state-schemes"],
# )
# docs = loader.load()

In [4]:
loader = TextLoader(file_path="scraped_content.txt", encoding="utf-8")
docs = loader.load()

In [5]:
docs[0].page_content # Preview first 500 characters of the document

'URL: https://www.startupindia.gov.in//content/sih/en/BRICS.html\nBRICS is a pivotal grouping that has emerged as a powerful force in shaping the global economic landscape, dedicated to fostering cooperation in areas such as trade, investment, technology, and global governance. Initially comprising Brazil, Russia, India, China, and South Africa, the bloc expanded following the 2023 BRICS summit, which formally invited Egypt, Ethiopia, Iran, and the United Arab Emirates to join. In 2025, Indonesia became a full member, further enhancing the group’s global influence.\nToday, BRICS nations collectively represent approximately 3.3 billion people, accounting for over 40% of the world’s population. Their economies contribute an estimated 37.3% of global GDP, reflecting their significant economic weight. The grouping, boasting massive consumer markets and workforce populations, has emerged as a key engine of global economic expansion, underscoring its significant role in reshaping the interna

In [6]:
len(docs)

1

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# vectorstore = Chroma.from_documents(documents=splits, 
#                                     embedding=GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
#                                     ,persist_directory="./chroma_google_genai")
# retriever = vectorstore.as_retriever()

GoogleGenerativeAIError: Error embedding content: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. 
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0 [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier"
}
]

In [10]:
from langchain_community.embeddings import OllamaEmbeddings

vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OllamaEmbeddings(model="embeddinggemma:300m")
                                    ,persist_directory="./chroma_google_genai")
retriever = vectorstore.as_retriever()

  embedding=OllamaEmbeddings(model="embeddinggemma:300m")


In [11]:
prompt = ChatPromptTemplate.from_template(
    """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
)

In [12]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0)

In [13]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [14]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    |prompt
    |llm
    |StrOutputParser()
)

rag_chain.invoke('which of the schemes available are for women?')

'The schemes available for women are Skill Upgradation and Mahila Coir Yojana, Mahila Samriddhi Yojana, Women Entrepreneurship Platform (WEP), Trade Related Entrepreneurship Assistance and Development (TREAD), Support to Training and Employment Programme for Women, Mudra Yojana for Women/ Mahila Udhyami Yojana, Stand-Up India, Nai Roshni- Scheme for Leadership Development of Minority Women, Mahila Shakti Kendra, and Women Scientists Scheme. Additionally, some states and union territories have specific provisions for women entrepreneurs, such as earmarking funds for them or offering monthly allowances.'

## Let's start with query translation 


![image](images/download.png)

### trying multi retrivel search

In [15]:
# for multiretrieval results
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
generate_queries.invoke({"question": "What are the initiatives for women entrepreneurs?"})

['What programs exist to support women starting businesses?',
 'Are there any grants or funding opportunities specifically for female entrepreneurs?',
 'What resources are available for women looking to launch or grow their ventures?',
 'Can you tell me about organizations or government bodies that champion women in entrepreneurship?',
 'What are the current efforts and projects aimed at empowering women entrepreneurs?']

In [25]:
from langchain_core.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "Which schemes are most beneficial for tech startups?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

15

In [26]:
docs[0].page_content

'The facilitator shall submit the claim for payment of fees to the respective Head of Office of the Trade Marks Registry. A letter addressed to the Head of Office of the respective Trade Mark Office, giving details of the claimed fee for drafting of the application and his ID proof as a registered Trade Mark Agent, shall be submitted along with the invoice.\nDifferent investors use different criteria to judge an investment. The importance of these factors would vary depending on the stage of investment, sector of startup, management team, etc. Listed below are typical investment criteria used by investors:\n1. Market Landscape: Refers to the addressable market that the startup is catering to.\nFactors: Market size, obtainable market share, adoption rate, historical and forecasted growth rates, macroeconomic drivers, demand-supply.\n2. Scalability and Sustainability : Startups should showcase the potential upscale in the near future, a sustainable and stable business plan.'

In [27]:
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'Based on the provided text, there isn\'t a direct answer to which specific schemes are *most* beneficial for tech startups. However, the text does highlight several areas that are relevant to tech startups and their growth:\n\n*   **Intellectual Property Rights (IPRs):** The text emphasizes the importance of protecting IPRs for startups to sustain themselves in a competitive world. The "Startups Intellectual Property Protection (SIPP)" scheme is mentioned as facilitating the protection of Patents, Trademarks, and Designs.\n*   **Funding and Investment:** Several sections discuss funding for startups, including:\n    *   **Venture Capital:** The text notes that venture capital has historically provided significant returns and that VCs help startups with raising subsequent rounds of funding, networking, and connections.\n    *   **Government Initiatives:** Programs like "Conecta Startup Brasil" and the "Centelha Program" in Brazil are mentioned as providing financial resources, mentorsh

### Now lets try RAGfusion

In [28]:
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [29]:
generate_queries = (
    prompt_rag_fusion 
    | ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0) 
    | StrOutputParser() 
    | (lambda x: [q.strip() for q in x.split("\n") if q.strip()])
)
generate_queries.invoke({"question": question})

['Here are 4 search queries related to "Which schemes are most beneficial for tech startups?":',
 '1.  "best government grants for tech startups"',
 '2.  "venture capital funding options for early-stage tech companies"',
 '3.  "startup accelerators and incubators with high success rates for tech"',
 '4.  "tax incentives for technology startups UK"']

In [32]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": "which schemes are for people in goa?"})
len(docs)

5

In [31]:
# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'Based on the provided context, there isn\'t a direct answer to which schemes are *most* beneficial for tech startups. However, here\'s what can be inferred:\n\n*   **Income Tax Exemption:** The context mentions that eligible startups can be exempted from paying income tax for 3 consecutive financial years out of their first ten years since incorporation. This is a significant benefit for any startup, including tech startups. To be eligible, the startup needs to be DPIIT recognized.\n*   **Patent and Trademark Facilitation:** The context highlights that filing patents can be expensive and time-consuming. There are facilitators available to help startups with patent and trademark applications, aiming to reduce cost and time. This is particularly relevant for tech startups that often rely on intellectual property.\n*   **State-Specific Policies:** Several states offer various benefits. For example, Maharashtra offers an "Incentive Fund for Top-rated Startup," "Investment Fund," and reimb

### Decomposition

In [33]:
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [34]:
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: [q.strip() for q in x.split("\n") if q.strip()]))

# Run
# question = "what schemes are available for women entrepreneurs?"
questions = generate_queries_decomposition.invoke({"question":question})

In [35]:
questions

['Here are 3 search queries related to "Which schemes are most beneficial for tech startups?":',
 '1.  "Government grants and funding for early-stage tech startups"',
 '2.  "Venture capital and angel investor programs for technology companies"',
 '3.  "Incubator and accelerator programs with proven success for tech startups"']

In [36]:
# Prompt
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [37]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    print(formatted_string)
    return formatted_string.strip()

q_a_pairs = ""
for q in questions:
    if not q or not q.strip():  # skip empty questions
        print(f"Skipping empty question")
        continue
    rag_chain = (
    {"context": itemgetter("question") | retriever, 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

Question: Here are 3 search queries related to "Which schemes are most beneficial for tech startups?":
Answer: Here are 3 search queries related to "Which schemes are most beneficial for tech startups?":

1.  "Government schemes for technology startups India"
2.  "Startup India schemes for tech companies"
3.  "Funding and support programs for Indian tech startups"


Question: 1.  "Government grants and funding for early-stage tech startups"
Answer: Based on the provided context, here's information relevant to government grants and funding for early-stage tech startups:

*   **Startup India Website:** The `startupindia.gov.in` website is a key resource. It mentions the "Pre-Seed Stage" of a startup, where entrepreneurs have an idea and are working to bring it to life. At this stage, funding needs are usually small, and channels are often informal.
*   **Bootstrapping and Friends/Family:** For the pre-seed stage, bootstrapping (using personal savings and revenue) and seeking investment f

In [38]:
answer

'Based on the provided context, here\'s information relevant to incubator and accelerator programs with proven success for tech startups:\n\n*   **Startup India\'s Role:** Startup India is actively involved in connecting startups with opportunities. The context mentions:\n    *   "Events hosted by incubators and accelerators with a learning opportunity for the youth to be linked here." This indicates that Startup India facilitates access to events organized by these programs, which are designed for learning and development.\n\n*   **Examples of Incubators/Accelerators (Global Context):** While not explicitly stated as having "proven success" within the Indian context, the document mentions examples of active innovation agencies and incubators in Brazil, which is a significant startup ecosystem. These include:\n    *   Startup Brasil\n    *   Google For Startups Accelerator Brazil\n\n*   **Government Support for Incubators/Accelerators (Global Context):** The Brazilian government\'s ini

- we can also get answers of each subqueries seperately. 

### Step back prompting

In [39]:
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]
# We now transform these to example messages
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
        ),
        # Few shot examples
        few_shot_prompt,
        # New question
        ("user", "{question}"),
    ]
)

In [40]:
generate_queries_step_back = prompt | llm | StrOutputParser()
# question = "which sector have the most schemes for startups?"
generate_queries_step_back.invoke({"question": question})

'What are the best ways to support tech startups?'

In [41]:
from langchain_core.runnables import RunnableLambda

response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        # Retrieve context using the normal question
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        # Retrieve context using the step-back question
        "step_back_context": generate_queries_step_back | retriever,
        # Pass on the question
        "question": lambda x: x["question"],
    }
    | response_prompt
    | llm
    | StrOutputParser()
)

chain.invoke({"question": question})

'Based on the provided documents, here\'s a breakdown of schemes and benefits that could be particularly advantageous for tech startups:\n\n**General Benefits for Startups (including Tech):**\n\n*   **Reduced Regulatory Burden:** The Startup India initiative aims to reduce the regulatory burden on startups, allowing them to focus on their core business and keep compliance costs low. This is beneficial for all startups, including tech ones, as it frees up resources and time.\n*   **Patent Protection:** While not a direct scheme, the document highlights the importance of patenting innovative ideas for tech startups to gain a competitive edge. It acknowledges that filing patents can be expensive, implying that any support in this area would be valuable.\n*   **Pro-bono Services:** Startup India has partnered with various corporations and organizations to offer free services in categories like management and cloud credits. These pro-bono services can significantly accelerate growth for tec

### Hyde(hypothetical doc embedding)

In [42]:
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser


generate_docs_for_retrieval = (
    prompt_hyde | llm | StrOutputParser() 
)

# Run
# question = "what are the best schemes for women in rural areas?"
generate_docs_for_retrieval.invoke({"question":question})

"Here's a scientific paper passage addressing the question of beneficial schemes for tech startups, focusing on a structured and evidence-based approach:\n\n## 3. Analysis of Beneficial Schemes for Technology Startups\n\nThe successful launch and sustained growth of technology startups are critically dependent on access to appropriate financial and strategic support mechanisms. This section critically analyzes various schemes, evaluating their potential benefits and limitations for early-stage technology ventures. The efficacy of a scheme is assessed based on its ability to address key startup challenges, including capital acquisition, market validation, talent recruitment, and intellectual property protection.\n\n**3.1. Seed and Venture Capital Funding:**\n\nSeed and venture capital (VC) funding represent a cornerstone of startup development, particularly for technology-intensive businesses requiring significant upfront investment in research, development, and scaling. **Benefits** ar

In [43]:
retrieval_chain = generate_docs_for_retrieval | retriever 
retrieved_docs = retrieval_chain.invoke({"question":question})
retrieved_docs

[Document(metadata={'source': 'scraped_content.txt'}, page_content='2. Raising Funds: Investors are the best guides for the startup to raise subsequent rounds of funding on the basis of stage, maturity, sector focus, etc. and aid in networking and connection for the founders to pitch their business to other investors.\n3. Recruiting Talent: Sourcing high-quality and best-fit human capital is critical for startups, especially when it comes to recruiting senior executives to manage and drive business goals. VCs, with their extensive network, can help bridge the talent gap by recruiting the right set of people at the right time.\n4. Marketing: VCs assist with marketing strategy for your product/service.\n5. M and A Activity: VCs have their eyes and ears open to merger and acquisition opportunities in the local entrepreneurial ecosystem to enable greater value addition to the business through inorganic growth.'),
 Document(metadata={'source': 'scraped_content.txt'}, page_content='URL: http

In [44]:
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":retrieved_docs,"question":question})

'Based on the provided text, there is no information about specific schemes that are most beneficial for tech startups. The text discusses how Venture Capitalists (VCs) can help startups in various ways, such as raising funds, recruiting talent, marketing, and M&A activity. It also mentions the pre-seed stage of a startup and funding options like bootstrapping and friends and family. However, it does not detail any particular schemes.'