In [79]:
import asyncio
import os as os
import re

import faiss as fs
import requests
from dotenv import load_dotenv
from fastapi import FastAPI, Request
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough, RunnableMap
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableMap

#### Stage 1 : Data Ingestion

In [81]:
# Load the PDF with PyPDFLoader and collect every item yielded by
# alazy_load() into `pages`, one at a time as the async iterator produces them.
# NOTE: a bare top-level `async for` only works where top-level await is
# supported (e.g. an IPython/Jupyter cell); in a plain Python script it is a
# SyntaxError and would need to be wrapped in an `async def`.
loader = PyPDFLoader("corpus/jesc101.pdf")
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [84]:
# Sanity check: show the metadata attached to the first loaded page.
print(pages[0].metadata)
# Uncomment to also dump the first page's raw extracted text:
# print(pages[0].page_content)

{'producer': 'GPL Ghostscript 8.15', 'creator': 'PageMaker 7.0', 'creationdate': '2017-12-20T12:15:25+00:00', 'author': 'dtpcell5', 'moddate': '2025-04-04T12:10:04+05:30', 'title': 'CHAP 1.pmd', 'source': 'corpus/jesc101.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}


In [85]:
# Function to clean the text
def clean_text(text: str) -> str:
    """Strip extraction artefacts from a page of text and normalise its whitespace.

    The steps run in this order (later steps tidy up gaps left by earlier ones):

    1. remove every literal ``/square6`` marker;
    2. remove figure references such as ``Figure 3`` or ``Figure 1.2``;
    3. remove activity headings such as ``Activity 1.3``;
    4. collapse blank or whitespace-only lines into a single newline;
    5. collapse runs of spaces into one space;
    6. trim leading and trailing whitespace.

    Relies on the standard-library ``re`` module, which must be imported in
    the notebook's import cell.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text.
    """
    text = text.replace("/square6", "")
    text = re.sub(r"Figure\s+\d+(\.\d+)?", "", text)
    text = re.sub(r"(Activity\s+\d+(\.\d+)?)+", "", text)
    # Runs after the removals above so the empty lines they leave are squeezed out too.
    text = re.sub(r'\n\s*\n', '\n', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

# Build one cleaned Document per loaded page: the text goes through
# clean_text(), while the page's metadata object is carried over as-is.
cleaned_docs = [
    Document(page_content=clean_text(raw_page.page_content), metadata=raw_page.metadata)
    for raw_page in pages
]

#### Chunking the data

In [86]:
# Splitter configuration: chunk_size=500 with chunk_overlap=50 between
# consecutive chunks (both measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

In [89]:
# Split the cleaned page documents into chunks with the configured splitter.
text_chunks = text_splitter.split_documents(cleaned_docs)

#### Embedding the text chunks and storing them in FAISS

In [90]:
# Embedding model shared by the indexing and retrieval cells.
# The credential belongs in `openai_api_key`; `openai_api_type` selects the
# API flavour and is not a credential field, so passing the key there was a bug.
# NOTE(review): `openai_api_key` is not defined in any visible cell — confirm
# it is loaded (e.g. from the environment) before this cell runs.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [91]:
# Build a FAISS vector store from the chunks, using `embeddings` to vectorise them.
vector_store = FAISS.from_documents(text_chunks, embeddings)

In [92]:
# Persist the vector store to the local path "my_faiss_vector_store".
vector_store.save_local("my_faiss_vector_store")

In [93]:
# Reload the store that was just saved and expose it as a retriever.
# SECURITY: allow_dangerous_deserialization=True opts in to loading the
# serialized docstore; only use it on index folders this notebook wrote itself,
# never on files obtained from an untrusted source.
retriever = FAISS.load_local(
    "my_faiss_vector_store",
    embeddings,
    allow_dangerous_deserialization=True,
).as_retriever()


In [94]:
# Prompt text for the answer step; the {question} and {context} placeholders
# are filled from the keys produced by the chain.
# NOTE(review): the prompt contains typos ("quesition", "taks", "dont") and never
# explicitly tells the model to answer from the supplied context. It is left
# byte-identical here because it is runtime text — revise it deliberately if desired.
template = """You are a teaching assistant for students of some school and are responsible for quesition-answer taks.
If you are not aware of the answer, simply respond like I dont know the answer or similar. Use 10 sentences maximum to answer the question.
Question : {question}
Context : {context}
Answer : """


In [95]:
# Wrap the raw template string in a PromptTemplate so it can be piped into the chain.
prompt = PromptTemplate.from_template(template)

In [96]:
# String output parser instance.
# NOTE(review): the chain cell constructs its own StrOutputParser() and does not
# reference this variable — confirm whether it is still needed.
output_parser = StrOutputParser()

In [98]:
def format_docs(docs):
    """Concatenate the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined with a blank line between them;
        an empty string when ``docs`` is empty.
    """
    contents = (item.page_content for item in docs)
    return "\n\n".join(contents)

In [100]:
# Full RAG chain that keeps track of source docs
# NOTE(review): `llm_model` is not defined in any visible cell — presumably a
# chat model instance; confirm it is created before this cell runs.

# Stage 1: keep the incoming question and look up the documents relevant to it.
retrieve_step = RunnableMap({
    "question": RunnablePassthrough(),
    "context": lambda query: retriever.get_relevant_documents(query),
})

# Stage 2: flatten the retrieved documents into prompt text, while keeping the
# original document objects under "source_docs" for later citation.
format_step = RunnableMap({
    "question": lambda state: state["question"],
    "context": lambda state: format_docs(state["context"]),
    "source_docs": lambda state: state["context"],
})

# Stage 3: generate the answer string and pass the source documents through.
answer_step = RunnableMap({
    "answer": prompt | llm_model | StrOutputParser(),
    "source_docs": lambda state: state["source_docs"],
})

full_chain = retrieve_step | format_step | answer_step


# Run a sample question through the whole pipeline.
result = full_chain.invoke("what is redox")

# Show the generated answer, followed by one line per retrieved source chunk.
print("Answer:\n", result["answer"])
print("\nSources:")
for doc in result["source_docs"]:
    meta = doc.metadata
    source = meta.get("source", "Unknown file")
    page = meta.get("page", "Unknown page")
    # The stored page index is zero-based, so display it one-based;
    # anything that is not an int is shown as '?'.
    if isinstance(page, int):
        page_label = int(page) + 1
    else:
        page_label = '?'
    print(f"Document: {source}, Page: {page_label}")

Answer:
 Redox reactions are oxidation-reduction reactions that occur during a chemical reaction. In these reactions, one substance gains oxygen or loses hydrogen (oxidation) while the other substance loses oxygen or gains hydrogen (reduction). For example, in reaction (1.31), carbon is oxidized to CO and ZnO is reduced to Zn. In reaction (1.32), HCl is oxidized to Cl2 and MnO2 is reduced to MnCl2. When a substance gains oxygen during a reaction, it is oxidized, and when a substance loses oxygen during a reaction, it is reduced. The colour of copper sulphate solution changes when an iron nail is dipped in it because the iron displaces copper from copper sulphate, forming a brown coating of copper on the iron nail. This is an example of a redox reaction.

Sources:
Document: corpus/jesc101.pdf, Page: 12
Document: corpus/jesc101.pdf, Page: 13
Document: corpus/jesc101.pdf, Page: 12
Document: corpus/jesc101.pdf, Page: 12
