# Source: https://bishalbose294.medium.com/building-a-simple-rag-system-from-scratch-a-comprehensive-guide-6667af8ccb8c

# Component 1: PDF Loader

## First, we need a way to ingest PDF documents into our system

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

class PdfLoader:
  def __init__(self, ):
    pass

  def read_file(self, file_path):
    loader = PyMuPDFLoader(file_path)
    docs = loader.load()   # saves each page as a separate document
    return docs   
  pass

# Component 2: Text Chunking

## Once we’ve loaded our documents, we need to split them into smaller chunks for effective retrieval:

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

class Chunker:
  def __init__(self, chunk_size=1000, chunk_overlap=100):
    self.text_splitter = RecursiveCharacterTextSplitter(
      # Set a really small chunk size, just to show.
      separators=[
          "\n\n",
          "\n",
          " ",
          ".",
          ",",
          "\u200b",  # Zero-width space
          "\uff0c",  # Fullwidth comma
          "\u3001",  # Ideographic comma
          "\uff0e",  # Fullwidth full stop
          "\u3002",  # Ideographic full stop
          "",
      ],
      chunk_size=chunk_size,
      chunk_overlap=chunk_overlap,
      length_function=len,
      is_separator_regex=False,
  )
  pass
  def chunk_docs(self, docs):
    list_of_docs = []
    for doc in docs:
      tmp = self.text_splitter.split_text(doc.page_content)
      for chunk in tmp:
        list_of_docs.append(
            Document(
            page_content=chunk,
            metadata=doc.metadata,
          )
        )
    return list_of_docs

# Component 3: Vector Store

## Now that we have our document chunks, we need to convert them to vector embeddings and store them for efficient retrieval:

In [4]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
from uuid import uuid4


class VectorStore:
  def __init__(self, ):
    self.embeddings = OllamaEmbeddings(model="llama3.2:3b")
    self.index = faiss.IndexFlatL2(len(self.embeddings.embed_query("hello world")))
    self.vector_store = FAISS(
        embedding_function=self.embeddings,
        index=self.index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    pass
  
  def add_docs(self, list_of_docs):
    uuids = [str(uuid4()) for _ in range(len(list_of_docs))]
    self.vector_store.add_documents(documents=list_of_docs, ids=uuids)
    pass

  def search_docs(self, query, k=5):
    results = self.vector_store.similarity_search(
        query,
        k=k,
    )
    return results
    pass
  pass

# Component 4: The RAG System

## Finally, we can put all these components together into a complete RAG system:

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain_ollama import OllamaLLM

class RAG:

  def __init__(self, ):
    self.instructor_prompt = """Instruction: You're an expert problem solver you answer questions from context given below. 
    You strictly adhere to the context and never move away from it. 
    You're honest and if you do not find the answer to the question in the context you politely say "I don't know!"
    So help me answer the user question mentioned below with the help of the context provided
    User Question: {user_query}
    Answer Context: {answer_context}
    """
    self.prompt = PromptTemplate.from_template(self.instructor_prompt)
    self.llm = OllamaLLM(model="llama3.2:3b") #OpenAI()
    self.vectorStore = VectorStore()
    self.pdfloader = PdfLoader()
    self.chunker = Chunker()
    pass

  def run(self, filePaths, query):
    
    docs = []
    for filePath in filePaths:
      pdfdocs = self.pdfloader.read_file(filePath)
      docs += pdfdocs
    print(docs)

    list_of_docs = self.chunker.chunk_docs(docs)
    self.vectorStore.add_docs(list_of_docs)
    results = self.vectorStore.search_docs(query)
    answer_context = "\n\n"
    for res in results:
      answer_context = answer_context + "\n\n" + res.page_content
    chain = self.prompt | self.llm
    response = chain.invoke(
        {
            "user_query": query,
            "answer_context": answer_context,
        }
    )
    return response
  pass

if __name__ == "__main__":
  rag = RAG()
  filePaths=["BRIEF__How_to_Use_Data_to_Improve_Non-Degree_Workforce_Community_College_Programs.pdf",
             "EDC-Workforce-Systems-Best-Practices-Apprenticeships-2019.pdf",
             "GrowingEquityandDiversitythroughApprenticeship-BizPerspectives-07182019-2.pdf",
             "How to Design Effective Apprenticeship Programs to Build a Skilled Workforce A Guide for Program Sponsors.pdf"]
  with open("L_I_Apprenticeship_Apprentice_Details_20250608.csv", "r") as f:
      csv_content = f.read()
  query="What are the three most critical metrics for evaluating the success of apprenticeship programs? Please answer based on the context provided in the pdf files."
  response = rag.run(filePaths, query)
  print(response)

[Document(metadata={'producer': 'Skia/PDF m123 Google Docs Renderer', 'creator': '', 'creationdate': '2024-02-13T18:51:17-05:00', 'source': 'BRIEF__How_to_Use_Data_to_Improve_Non-Degree_Workforce_Community_College_Programs.pdf', 'file_path': 'BRIEF__How_to_Use_Data_to_Improve_Non-Degree_Workforce_Community_College_Programs.pdf', 'total_pages': 9, 'format': 'PDF 1.6', 'title': 'BRIEF: How to Use Data to Improve Non-Degree Workforce Community College Programs', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-02-14T12:25:05-05:00', 'trapped': '', 'modDate': "D:20240214122505-05'00'", 'creationDate': "D:20240213185117-05'00'", 'page': 0}, page_content='EDUCATION POLICY PROGRAM\nHow to Use Data To Improve Non-Degree \nWorkforce Programs at Community Colleges\nThe third in a three-part series from New America’s New Models for Career Preparation Project\nFebruary 2023\nIris Palmer and Shalin Jyotishi\nAbout the New Models of Career Preparation\nProject\nDespite the growing deman