# Learning rag from scratch

### basic implementation

In [2]:
import bs4
import os
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
import getpass
import os

if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

In [4]:
# Load Documents
loader = WebBaseLoader(
    web_paths=["https://www.startupindia.gov.in/content/sih/en/women_entrepreneurs.html","https://wep.gov.in/state-schemes"],
)
docs = loader.load()

In [5]:
docs[0].page_content # Preview first 500 characters of the document

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWomen Entrepreneurship\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n▲\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                           Login\n                                        \n\n\n\n\n\n\n\n\n\n\n\n\nDashboard\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDashboard\n\n\nView\n                                                    Profile\n\n\nEdit Profile\n\n\n\n\n\n\nMy Connections\n\n\n\n                                                    Notifications\n                                                    \n\n\n\n\nSettings\n\n Privacy Settings\n\n\n\nChange Email ID\n\n\n                                                            Change Password\n Create\n                                                            Password\n\n\n\n Logout \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  \n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOur Toll Fre

In [6]:
len(docs)

2

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [8]:
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001"))
retriever = vectorstore.as_retriever()

In [9]:
prompt = ChatPromptTemplate.from_template(
    """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
)

In [10]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0)

In [11]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [12]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    |prompt
    |llm
    |StrOutputParser()
)

rag_chain.invoke('which of the schemes available are for women?')

'The schemes available for women include the Mahila Shakti Kendra (MSK) Scheme, a micro-finance scheme for women with interest rebates, and a scheme for rural women artisans in coir fibre producing regions. Additionally, there are several schemes listed under Niti Aayog for women, such as the Skill Upgradation and Mahila Coir Yojana, Mahila Samriddhi Yojana, and Women Entrepreneurship Platform (WEP).'

## Let's start with query translation 


![image](images/download.png)

### trying multi retrivel search

In [13]:
# for multiretrieval results
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
generate_queries.invoke({"question": "What are the initiatives for women entrepreneurs?"})

['What programs exist to support women starting businesses?',
 'Are there any grants or funding opportunities specifically for female entrepreneurs?',
 'What resources are available for women looking to launch or grow their ventures?',
 'Can you tell me about organizations or government bodies that champion women in entrepreneurship?',
 'What are the current efforts and projects aimed at empowering women entrepreneurs?']

In [14]:
from langchain_core.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
question = "Which schemes are available forwomen in kerela?"
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)

  return [loads(doc) for doc in unique_docs]


18

In [15]:
docs[0].page_content

'Kerala\n\n\n Kerala Startup Mission (KSUM) supports women Startups with a soft \n                                    loan scheme for an amount limited to Rs.15 Lakhs as working capital \n                                    for implementing works and projects received from the Govt. \n                                    departments and Public Sector Undertakings in Kerala. For young (\n                                    18 to 45 years) Women and SC/ST entrepreneurs, the assistance is 20% up to INR 30 lakhs.\n                                    View State Startup Policies\n\n\n\n\n\n\n\nMaharashtra'

In [16]:
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'The Kerala Startup Mission (KSUM) supports women startups with a soft loan scheme. The loan amount is limited to Rs. 15 Lakhs as working capital for implementing works and projects received from government departments and public sector undertakings in Kerala. For young women (18 to 45 years) and SC/ST entrepreneurs, the assistance is 20% up to INR 30 lakhs.'

### Now lets try RAGfusion

In [17]:
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [18]:
generate_queries = (
    prompt_rag_fusion 
    | ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite",temperature=0) 
    | StrOutputParser() 
    | (lambda x: [q.strip() for q in x.split("\n") if q.strip()])
)
generate_queries.invoke({"question": "What are the initiatives for women entrepreneurs?"})

['Here are 4 search queries related to "What are the initiatives for women entrepreneurs?":',
 '1.  **"government programs supporting women entrepreneurs"**',
 '2.  **"grants and funding for female business owners"**',
 '3.  **"mentorship and networking opportunities for women in business"**',
 '4.  **"resources and training for aspiring women entrepreneurs"**']

In [20]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": "which schemes are for people in goa?"})
len(docs)

19

In [21]:
# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'The Kerala Startup Mission (KSUM) offers a soft loan scheme for women startups, with a limit of Rs. 15 Lakhs for working capital. For young women (18-45 years) and SC/ST entrepreneurs, the assistance is 20% up to INR 30 lakhs.'

### Decomposition

In [22]:
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [23]:
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: [q.strip() for q in x.split("\n") if q.strip()]))

# Run
question = "what schemes are available for women entrepreneurs?"
questions = generate_queries_decomposition.invoke({"question":question})

In [24]:
questions

['Here are 3 search queries related to "what schemes are available for women entrepreneurs?":',
 '1.  "government schemes for women entrepreneurs India"',
 '2.  "financial assistance programs for female business owners"',
 '3.  "grants and loans for women starting businesses"']

In [25]:
# Prompt
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [26]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    print(formatted_string)
    return formatted_string.strip()

q_a_pairs = ""
for q in questions:
    if not q or not q.strip():  # skip empty questions
        print(f"Skipping empty question")
        continue
    rag_chain = (
    {"context": itemgetter("question") | retriever, 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

Question: Here are 3 search queries related to "what schemes are available for women entrepreneurs?":
Answer: Here are 3 search queries related to "what schemes are available for women entrepreneurs?":

1.  **"Government schemes for women entrepreneurs India"**
2.  **"Financial assistance for women-owned businesses"**
3.  **"Startup India schemes for women founders"**


Question: 1.  "government schemes for women entrepreneurs India"
Answer: The provided context mentions that the "Women Entrepreneurship Platform (WEP)" hosts information and services relevant to women entrepreneurs, including funding and financial assistance, incubation and acceleration, compliance and tax assistance, entrepreneur skilling and mentorship, and marketing assistance. It also lists several government schemes for women, such as the Skill Upgradation and Mahila Coir Yojana, Mahila Samriddhi Yojana, Trade Related Entrepreneurship Assistance and Development (TREAD), Support to Training and Employment Programme 

In [27]:
answer

'The Women Entrepreneurship Platform (WEP) provides information and services for women entrepreneurs, including funding and financial assistance. Several government schemes offer grants and loans for women starting businesses:\n\n*   **Government of India (GoI) Grant:** This grant is provided to Non-Governmental Organisations (NGOs) to promote entrepreneurship among women. The grant can be up to 30 percent of the total project cost, which NGOs can use for training, counseling, and marketing support for beneficiaries. The remaining 70 percent of the project cost is financed by a lending agency as a loan.\n\n*   **Self Employment Lending Schemes- Credit Line 1 - Mahila_Samridhi_Yojana:** This scheme offers a soft loan of up to 25% of the project cost, with a maximum of Rs. 2.5 lakhs per project, for women entrepreneurs to set up new projects in the tiny/small-scale sector or to rehabilitate sick SSI units. The loan is repayable within 10 years.\n\n*   **Kerala Startup Mission (KSUM):** S

- we can also get answers of each subqueries seperately. 

### Step back prompting

In [28]:
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]
# We now transform these to example messages
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
        ),
        # Few shot examples
        few_shot_prompt,
        # New question
        ("user", "{question}"),
    ]
)

In [29]:
generate_queries_step_back = prompt | llm | StrOutputParser()
question = "which sector have the most schemes for startups?"
generate_queries_step_back.invoke({"question": question})

'What are the most popular industries for new businesses?'

In [30]:
from langchain_core.runnables import RunnableLambda

response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        # Retrieve context using the normal question
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        # Retrieve context using the step-back question
        "step_back_context": generate_queries_step_back | retriever,
        # Pass on the question
        "question": lambda x: x["question"],
    }
    | response_prompt
    | llm
    | StrOutputParser()
)

chain.invoke({"question": question})

"Based on the provided documents, it's not possible to definitively state which sector has the most schemes for startups. The documents focus on women entrepreneurship and highlight various initiatives and schemes related to it, but they do not offer a comprehensive overview of schemes across all sectors.\n\nHowever, the documents do indicate that **women entrepreneurship** is a significant focus area, with various government ministries and departments offering support. The **Ministry of Science and Technology** and the **Ministry of Micro, Small and Medium Enterprises** are mentioned in relation to schemes for women entrepreneurs.\n\nThe **Women Entrepreneurship Platform (WEP)**, hosted by Niti Aayog, acts as an aggregator for information and services relevant to women entrepreneurs, covering areas like:\n*   Community and Networking\n*   Funding and Financial Assistance\n*   Incubation and Acceleration\n*   Compliance and Tax Assistance\n*   Entrepreneur Skilling and Mentorship\n*   

### Hyde(hypothetical doc embedding)

In [36]:
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser


generate_docs_for_retrieval = (
    prompt_hyde | llm | StrOutputParser() 
)

# Run
question = "what are the best schemes for women in rural areas?"
generate_docs_for_retrieval.invoke({"question":question})

'Here\'s a scientific paper passage addressing the question of "best schemes for women in rural areas," focusing on a nuanced and evidence-based approach rather than a definitive, singular answer.\n\n---\n\n**Passage:**\n\nThe identification of "best schemes" for women in rural areas necessitates a departure from a one-size-fits-all paradigm, acknowledging the heterogeneity of rural contexts and the diverse needs and aspirations of women within them. Research consistently demonstrates that effective interventions are those that are **contextually appropriate, participatory, and multi-faceted**, addressing interconnected challenges rather than isolated issues.\n\n**Economic empowerment** remains a cornerstone, with schemes focusing on skills development, access to credit and financial literacy, and market linkages proving particularly impactful. For instance, studies on agricultural extension programs tailored to women\'s specific roles and knowledge, coupled with access to microfinance

In [37]:
retrieval_chain = generate_docs_for_retrieval | retriever 
retrieved_docs = retrieval_chain.invoke({"question":question})
retrieved_docs

[Document(metadata={'title': 'Women Entrepreneurship', 'description': 'The increasing presence of women as entrepreneurs has led to significant business and economic growth in the country. Women-owned business enterprises are playing a prominent role in society by generating employment opportunities in the country, bringing in demographic shifts and inspiring the next generation of women founders.', 'source': 'https://www.startupindia.gov.in/content/sih/en/women_entrepreneurs.html', 'language': 'en'}, page_content='Women belonging to minority communities/ rural areas \n\n\n\n\nLINK\n\n\n\n\n\n\n\n\n\n\nMinistry\nDepartment\nBenefit of the Scheme\nEligibility Criteria\nWebsite\n\n\n\n\n\n\nMinistry of Women and Child Development\n\n\n\xa0\n\n\nThe Mahila Shakti Kendra (MSK) Scheme was approved in November,\n                                                2017 as a centrally sponsored scheme to empower rural women through\n                                                 community partic

In [38]:
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":retrieved_docs,"question":question})

'Based on the provided text, the following scheme is mentioned for women in rural areas:\n\n*   **Mahila Shakti Kendra (MSK) Scheme:** This is a centrally sponsored scheme approved in November 2017. Its purpose is to empower rural women through community participation and to provide them with an interface to approach the government for their entitlements. It also focuses on empowering them through training and capacity building.'