# Simple RAG System

## Set up API access

In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path
import sys

# Add the 'rag' directory to the module search path so code inside it can be imported.
# NOTE(review): the entry is relative, so what it points at depends on the working
# directory; a later cell calls os.chdir() into an absolute ".../rag/" directory —
# confirm the intended location.
sys.path.append('rag')

In [None]:
# Load environment variables (API credentials) from a .env file.
# NOTE(review): this comment used to say "Groq", but the rest of this notebook only
# uses OpenAI clients (OpenAIEmbeddings, ChatOpenAI), which presumably read
# OPENAI_API_KEY from the environment — confirm the key name in .env.
load_dotenv()

## Load Documents into Vector Store

In [None]:
# Work from the project root so that relative paths used later in the notebook
# (e.g. "data/sample_contract.pdf") resolve correctly.
# The location can be overridden with the RAG_PROJECT_DIR environment variable so
# the notebook runs on other machines; the default is the original hard-coded
# path, so existing setups behave exactly as before.
project_dir = Path(
    os.environ.get("RAG_PROJECT_DIR", '/Users/michelleli/Documents/rag/')
)
os.chdir(project_dir)  # raises FileNotFoundError if the directory does not exist
# Last expression of the cell: displays the resulting working directory.
os.getcwd()

In [None]:
# Load the sample contract PDF into LangChain documents.
from langchain.document_loaders import PyPDFLoader
# Relative path: resolved against the current working directory, which an
# earlier cell sets with os.chdir().
pdf_path = "data/sample_contract.pdf"
loader = PyPDFLoader(pdf_path)
# `documents` is consumed by the text splitter in the next cell.
# NOTE(review): presumably one Document per PDF page — confirm against the
# PyPDFLoader documentation.
documents = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of up to 1000 units with a 200-unit
# overlap, where size is measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_documents(documents)

# Clean-up pass: swap every tab character for a single space, in place.
for chunk in chunks:
    chunk.page_content = chunk.page_content.replace('\t', ' ')

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embedding model used to turn chunk text into vectors
embeddings = OpenAIEmbeddings()

# Build a FAISS vector store from the chunks using that embedding model
vectorstore = FAISS.from_documents(chunks, embeddings)

## Set up Retriever

In [None]:
# Expose the vector store as a retriever: similarity search with k = 3
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

## Set up LLM call

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that produces the final answer
llm = ChatOpenAI(model="gpt-5-mini")

# Prompt with two placeholders, {context} and {question}.
# The template is spelled out line by line so its exact whitespace (the leading
# newline and the space after "Question:") is visible in the source.
custom_prompt = PromptTemplate(
    template=(
        "\n"
        "You are an expert lawyer assistant helping answer questions about contracts. Please answer the question in a concise and understandable manner.\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: \n"
        "{question}\n"
        "\n"
        "Answer:\n"
    ),
    input_variables=["context", "question"],
)

# Retrieval QA chain wiring the LLM, the retriever and the custom prompt together
# using the "stuff" chain type; source documents are requested in the output.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": custom_prompt},
)

## Example

In [None]:
# Ask a sample question: the chain is invoked with the question under the
# "query" key.
query = "What is the document about?"
result = qa_chain.invoke({"query": query})

# The generated answer is read from the "result" key of the returned mapping.
# NOTE(review): the chain was built with return_source_documents=True, so the
# retrieved chunks are presumably also available (e.g. under "source_documents")
# but are not shown here — confirm.
print("Answer:", result["result"])