## Data Ingestion & Chunking

In [1]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader, PyPDFLoader
from langchain_community.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [26]:
# Root folder that is searched recursively for source documents.
directory_path = 'data/'

# One DirectoryLoader per supported file type; each glob matches at any depth.
txt_loader = DirectoryLoader(path=directory_path, glob="**/*.txt", loader_cls=TextLoader)
pdf_loader = DirectoryLoader(path=directory_path, glob="**/*.pdf", loader_cls=PyPDFLoader)
pptx_loader = DirectoryLoader(path=directory_path, glob="**/*.pptx", loader_cls=UnstructuredPowerPointLoader)

# Run the loaders in a fixed order: text, then PDF, then PowerPoint.
txt_docs, pdf_docs, pptx_docs = (
    loader.load() for loader in (txt_loader, pdf_loader, pptx_loader)
)

# Single flat list holding the documents of every type, in load order.
all_docs = [*txt_docs, *pdf_docs, *pptx_docs]



In [28]:
# Chunks are capped at 400 characters, and consecutive chunks share an
# 80-character overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=80,
)

In [29]:
# Split every document in `all_docs` into chunks with the splitter configured above.
docs = splitter.split_documents(all_docs)

In [5]:
from pathlib import Path
# Folder handed to Text_Loader(...) in the cells below.
directory_path = 'data/'

In [7]:
def Text_Loader(dir: Path):
    """Load every .txt, .pdf and .pptx file found under ``dir``.

    Args:
        dir: Directory to scan recursively. A ``str`` or ``pathlib.Path`` is
            accepted. (The name shadows the ``dir`` builtin inside this
            function; it is kept so existing callers are unaffected.)

    Returns:
        A single list with the loaded documents, ordered text files first,
        then PDFs, then PowerPoint files.
    """
    # Use the caller-supplied directory. The previous version ignored this
    # argument and always read the notebook-level `directory_path` global.
    root = str(dir)

    # (glob pattern, loader class) pairs, in the order their results are combined.
    loader_specs = (
        ("**/*.txt", TextLoader),
        ("**/*.pdf", PyPDFLoader),
        ("**/*.pptx", UnstructuredPowerPointLoader),
    )

    all_docs = []
    for pattern, loader_cls in loader_specs:
        loader = DirectoryLoader(path=root, glob=pattern, loader_cls=loader_cls)
        all_docs.extend(loader.load())

    return all_docs



In [10]:
def doc_splitter(docs, chunk_size=400, chunk_overlap=80):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split (e.g. the list returned by ``Text_Loader``).
        chunk_size: Maximum size of each chunk, in characters. Defaults to 400.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 80.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [None]:
# Load the raw documents first, then chunk that list. The splitter must be
# given the loaded documents, not the Text_Loader function itself.
raw_docs = Text_Loader(directory_path)
docs = doc_splitter(raw_docs)

In [8]:
# Keep the result under its own name. Assigning it back to `Text_Loader`
# replaces the function with a list, so any later call (or re-running this
# cell) fails with "'list' object is not callable".
loaded_docs = Text_Loader(directory_path)

In [39]:
import os 
from dotenv import load_dotenv
load_dotenv()
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

In [43]:

# Read the Groq API key from the environment (load_dotenv() ran in the import
# cell). os.getenv returns None when the variable is not set.
groq_api_key = os.getenv("GROQ_API_KEY")

In [44]:
# Hugging Face sentence-transformers model; passed as the `embedding` argument
# when the FAISS index is built below.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [47]:
# SECURITY: never hardcode API keys in a notebook. The key that used to be
# written here has been exposed and should be revoked and rotated. The key is
# now taken from the GROQ_API_KEY environment variable (see `groq_api_key`).
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your environment or .env file."
    )

# Groq-hosted chat model; temperature=0 keeps answers as deterministic as possible.
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0,
    api_key=groq_api_key,
)

In [48]:
# FAISS lives in langchain_community; the old `langchain.vectorstores` path is
# a deprecated re-export that newer LangChain releases no longer provide.
from langchain_community.vectorstores import FAISS

# Requires `embedding_model` and `docs` (a list of chunked Document objects)
# from the earlier cells.

# Build the FAISS index by embedding every chunk.
db = FAISS.from_documents(documents=docs, embedding=embedding_model)

# (Optional) Save the FAISS index locally so it can be reused without re-embedding.
db.save_local("faiss_index")

# (Optional) Load it back later. The saved docstore is pickled, so current
# releases require an explicit opt-in; only load index files you created yourself.
# db = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)


In [49]:
# FAISS version
# NOTE(review): this is the same from_documents call as in the previous cell,
# so every chunk is embedded a second time; this cell looks redundant.
db = FAISS.from_documents(documents=docs, embedding=embedding_model)


In [50]:
# Retriever over the FAISS index; k=5 asks for the five closest chunks per query.
# NOTE(review): the name is spelled `retriver` and the generation cell uses the
# same spelling, so rename both together if it is ever corrected.
retriver = db.as_retriever(search_kwargs={'k': 5})

In [51]:
from langchain_core.prompts import ChatPromptTemplate

## PROMPT DESIGNING

In [52]:
# Grounded question-answering prompt. {context} receives the retrieved chunks
# and {question} the user's query. Only the grammar and typos of the
# instructions were corrected; the template variables and layout are unchanged.
prompt = ChatPromptTemplate.from_template(
    '''
    You are an AI researcher who is an expert in RAG systems.
    Answer any question asked by the user.
    construct answers in the form of bullet points
    Craft your response only from the provided context.
    If you cannot find any related information from the context, simply say no context provided.
    Do not hallucinate.
    
    <context>
    {context}
    </context>
    
    QUESTION:{question}
    '''
)

In [53]:

from langchain.chains.combine_documents import create_stuff_documents_chain
# Chain that combines `llm` with `prompt`; it is invoked in the generation cell
# with the retrieved documents as "context" and the user's query as "question".
document_chain = create_stuff_documents_chain(llm, prompt) 

## GENERATION

In [54]:
# Read the user's question from standard input (no prompt text is shown).
user_prompt = input()

# Fetch the chunks most relevant to the question; they fill {context} in the prompt.
relevant_info = retriver.invoke(user_prompt)

# Hand the retrieved chunks and the question to the chain, then show the answer.
response = document_chain.invoke(
    {
        "context": relevant_info,
        "question": user_prompt,
    }
)
print(response)

No context provided.


In [35]:
# Spot-check: display the text of the chunk at index 10.
docs[10].page_content

'This process ensures that the generated text is not only contextually accurate but also grounded in factual information, making RAG particularly useful for applications such as question answering, document summarization, and legal assistant tools.\n\nAdvanced RAG Techniques'

In [14]:
# Load only the .txt files again, using the `txt_loader` built in the ingestion cell.
documents = txt_loader.load()

In [15]:
# Print each loaded document: its text first, then its metadata (file info such as 'source').
for doc in documents:
    print(doc.page_content, doc.metadata, sep="\n")


Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) -- models which combine pre-trained parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wi