In [None]:
%pip install -U langchain-community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv langchain

In [None]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably the OpenMP "duplicate runtime"
# workaround (sometimes needed when faiss is loaded next to another OpenMP
# library) — confirm it is still required in this environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

### Document Loading

In [8]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load one PDF on its own first, as a quick check that the loader works.
loader = PyMuPDFLoader("./rag_dataset/military_docs/Deep_Reinforcement_Learning-Ba.pdf")

# load() returns a list of documents (presumably one per PDF page — confirm);
# `docs` is rebuilt from every PDF in a later cell.
docs = loader.load()

In [9]:
# Keep the first loaded document for inspection; uncomment the print to view its text.
doc = docs[0]
# print(doc.page_content)

In [10]:
import os

def find_pdfs(root_dir):
    """Return the paths of all ``.pdf`` files below ``root_dir``, sorted.

    ``os.walk`` yields directory entries in arbitrary, platform-dependent
    order, so the result is sorted to make everything derived from it
    (``docs[0]``, ``chunks[0]``, the order of vectors in the index)
    reproducible across runs and machines.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A sorted list of paths, each built as ``os.path.join(dirpath, filename)``.
        The list is empty when ``root_dir`` does not exist or holds no PDFs.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.pdf'):
                found.append(os.path.join(dirpath, filename))
    return sorted(found)


# Every PDF under the dataset folder, in a stable order.
pdfs = find_pdfs('rag_dataset')

In [11]:
# Load every discovered PDF and flatten the per-file results into a single
# list of documents, keeping the order of `pdfs`.
docs = [
    page
    for pdf_path in pdfs
    for page in PyMuPDFLoader(pdf_path).load()
]

In [None]:
# Total number of documents loaded across all PDFs.
len(docs)

### Document Chunking

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every loaded document into overlapping chunks.
chunks = text_splitter.split_documents(docs)

In [None]:
# Number of loaded documents vs. number of chunks produced by the splitter.
len(docs), len(chunks)

In [None]:
# Character count of the first loaded document vs. the first chunk.
len(docs[0].page_content), len(chunks[0].page_content)

In [None]:
import tiktoken

# Token counts of the first loaded document vs. the first chunk.
# NOTE(review): this uses the gpt-4o-mini tokenizer, while the notebook embeds
# with nomic-embed-text and generates with llama3.2 — the counts are presumably
# only an approximation for those models.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')

tuple(len(encoding.encode(item.page_content)) for item in (docs[0], chunks[0]))


### Document Vector Embedding

In [17]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [18]:
# Embedding client for the 'nomic-embed-text' model, pointed at the Ollama
# endpoint given by base_url (a server must be reachable there).
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# Embed a throwaway sentence; its length is used below to size the FAISS index.
single_vector = embeddings.embed_query("rambutan is the best fruit!")

In [None]:
# Length of the embedding vector; used in the next cell as the index dimension.
len(single_vector)

In [None]:
# Create a FAISS IndexFlatL2 whose dimension equals the embedding length.
index = faiss.IndexFlatL2(len(single_vector))
# Display the index's `ntotal` and `d` attributes
# (presumably the stored-vector count and the dimension — confirm).
index.ntotal, index.d

In [21]:
# Assemble the vector store from the index created above, the Ollama embedding
# function, a fresh in-memory docstore, and an initially empty
# index -> docstore-id mapping.
vector_store = FAISS(
    embedding_function = embeddings,
    index = index,
    docstore = InMemoryDocstore(),
    index_to_docstore_id={}
)

In [None]:
# Number of chunks about to be added to the vector store.
len(chunks)

In [23]:
# Add every chunk to the vector store and keep the returned ids.
# NOTE(review): this presumably embeds each chunk through the Ollama model, so
# it may be slow for large datasets — consider the save/load cell below.
ids = vector_store.add_documents(documents=chunks)

In [None]:
# Compare the number of index -> docstore-id entries with the number of ids
# returned by add_documents. Only the last expression of a cell is displayed,
# so both values are combined into one tuple rather than written as two
# separate expressions (the first of which would be silently discarded).
len(vector_store.index_to_docstore_id), len(ids)

In [72]:
# store vector database
# db_name = 'military_info'
# vector_store.save_local(db_name)

# load the vector database
# new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)

### Retrieval

In [None]:
# Plain similarity search straight on the vector store, printing each hit's
# text followed by blank-line spacing.
question = "How can VR be used in military training?"
docs = vector_store.search(query=question, search_type='similarity')

for doc in docs:
    print(doc.page_content, end="\n\n\n\n")

In [26]:
# Wrap the vector store as a retriever using the 'mmr' search type, asking for
# k=3 results with fetch_k=100 and lambda_mult=1.
# NOTE(review): per the LangChain docs, lambda_mult=1 should mean minimum
# diversity (results close to plain similarity ranking) — confirm this is
# intended, or lower it to get more varied results.
retriever = vector_store.as_retriever(search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 100, 'lambda_mult': 1})

In [None]:
docs = retriever.invoke(question)
# for doc in docs:
#     print(doc.page_content)
#     print("\n\n")

question = "Summarize the conclusion of the VR_Training_Opportunities_in_t.pdf document"
docs = retriever.invoke(question)
output = rag_chain.invoke(question)
print(output)

### R.A.G w/ LLAMA 3.2 (3b params)

In [30]:
from langchain import hub # uploading, browsing, pulling & managing prompts
from langchain_core.output_parsers import StrOutputParser # gives final output as string data
from langchain_core.runnables import RunnablePassthrough # pass question & context directly to the model
from langchain_core.prompts import ChatPromptTemplate # pass prompt, context & question

from langchain_ollama import ChatOllama # makes connection from your model to langchain

In [None]:
model = ChatOllama(model="llama3.2", base_url="http://localhost:11434")

model.invoke("oi!")

In [86]:
# prompt = hub.pull("langchain-ai/rag-fusion-query-generation")

In [32]:
prompt = """ 
You are an assistant for question-answering tasks. 
Use the following retrieved information to answer the questions. 
If you don't know, just state that you don't know. Use three sentences maximum, keeping the answer concise.
Question: {question}
Context: {context}
Answer:
"""

prompt = ChatPromptTemplate.from_template(prompt)

In [None]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])
print(format_docs(docs))

In [34]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)