In [75]:
# One-time environment setup: uncomment the line below to install the packages this notebook imports.
# !pip install -U langchain-community faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv langchain

In [3]:
import os
import warnings
from dotenv import load_dotenv

# NOTE(review): presumably a workaround for duplicate OpenMP runtime errors (e.g. triggered by faiss) — confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Suppresses every Python warning for the rest of the session, deprecation notices included.
warnings.filterwarnings("ignore")
# Read variables from a local .env file into the process environment.
load_dotenv()

True

### Document Loading

In [13]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load one PDF on its own first, as a quick check that the loader works.
sample_pdf = "./rag_dataset/military_docs/Deep_Reinforcement_Learning-Ba.pdf"
loader = PyMuPDFLoader(sample_pdf)
docs = loader.load()

In [14]:
# Keep the first loaded document at hand for ad-hoc inspection.
doc = docs[0]
# print(doc.page_content)

In [18]:
import os

# Recursively collect the path of every file ending in '.pdf' below the dataset directory.
pdfs = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk('rag_dataset')
    for filename in filenames
    if filename.endswith('.pdf')
]

In [21]:
# Load every discovered PDF and flatten the per-file results into a single list.
docs = []
for pdf_path in pdfs:
    docs += PyMuPDFLoader(pdf_path).load()

In [23]:
# Total number of documents loaded across all PDFs.
len(docs)

73

### Document Chunking

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks (chunk_size=1000, with 100 of overlap
# between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(docs)

In [27]:
# Number of loaded documents vs. number of chunks produced by the splitter.
len(docs), len(chunks)

(73, 250)

In [41]:
# Character count of the first loaded document vs. the first chunk.
len(docs[0].page_content), len(chunks[0].page_content)

(4066, 926)

In [44]:
import tiktoken

# Token counts below use the tokenizer tiktoken associates with 'gpt-4o-mini'.
# NOTE(review): the models used later in this notebook are Ollama-served
# ('nomic-embed-text', 'llama3.2'), so these counts are presumably only an approximation for them.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')

# Token count of the first loaded document vs. the first chunk.
len(encoding.encode(docs[0].page_content)), len(encoding.encode(chunks[0].page_content))


(840, 242)

### Document Vector Embedding

In [48]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [56]:
# Embedding client for the 'nomic-embed-text' model at the given Ollama base URL.
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# Embed a throwaway sentence; the length of the result is used below to size the FAISS index.
single_vector = embeddings.embed_query("rambutan is the best fruit!")

In [57]:
# Dimensionality of the embedding vector.
len(single_vector)

768

In [60]:
# Create an empty flat L2 index whose dimension equals the embedding length.
index = faiss.IndexFlatL2(len(single_vector))
# (number of vectors stored so far, vector dimension)
index.ntotal, index.d

(0, 768)

In [62]:
# Assemble the vector store from the empty index, an in-memory document store
# and an initially empty index-position -> document-ID mapping.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [64]:
# Number of chunks about to be embedded and added to the vector store.
len(chunks)

250

In [65]:
# Add every chunk to the vector store and keep the returned IDs.
# NOTE(review): re-running this cell presumably adds the same chunks a second time (duplicate entries) — confirm before re-executing.
ids = vector_store.add_documents(documents=chunks)

In [67]:
# Sanity check: one ID should have been returned for each chunk that was added.
# (The former bare `vector_store.index_to_docstore_id` expression was dropped:
# only the last expression of a cell is displayed, so it had no effect.)
assert len(ids) == len(chunks), "expected exactly one stored ID per chunk"
len(ids)

250

In [72]:
# Persist the vector database to disk (uncomment to run).
# db_name = 'military_info'
# vector_store.save_local(db_name)

# Reload the vector database from disk (uncomment to run).
# NOTE(review): allow_dangerous_deserialization=True presumably permits pickle-based loading — only load index files you created yourself.
# new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)

### Retrieval

In [76]:
question = "How can VR be used in military training?"
# Store the hits under their own name so the `docs` list built in the loading
# cells is not overwritten; otherwise re-running the chunking cell would split
# these search results instead of the full corpus.
retrieved_docs = vector_store.search(query=question, search_type='similarity')

# Print each hit's text followed by blank lines as a visual separator.
for retrieved_doc in retrieved_docs:
    print(retrieved_doc.page_content)
    print("\n\n")

tests and according to the purpose of the GTS system – we dig deeper into 
the individual and sub-unit infantry training. Considering the current state of 
development, we present the training areas where the introduction of VR can 
be particularly beneficial.
Keywords: military training, VR, AR, simulation, immersivity
Introduction
The well-known fact that military developments bring breakthroughs in technology is still 
true, but some technologies are spreading and refining much faster in the civilian sector. 
An example of this is VR (virtual reality) technology. Extensive research was done in 
this area decades ago, but certain boundaries could not be crossed at that time. A major 
development started when the hardware and software of Oculus’s first VR devices became 
open source and the potential for entertainment was recognised. As consumer VR devices 
gained popularity, many companies started to develop headsets and software. Later, these



the capabilities of today’s VR techno

In [77]:
# MMR retriever configured with k=3 results chosen from fetch_k=100 candidates.
# NOTE(review): LangChain documents lambda_mult=1 as minimum diversity, so this presumably behaves like plain similarity ranking — confirm this is intended.
retriever = vector_store.as_retriever(search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 100, 'lambda_mult': 1})

In [93]:
# Retrieve documents for the current `question` and keep them in `docs` for inspection.
docs = retriever.invoke(question)
# for doc in docs:
#     print(doc.page_content)
#     print("\n\n")

question = "What is an intrusion detection system?"
# docs = retriever.invoke(question)
# NOTE(review): `rag_chain` is only defined in a later cell of this notebook, so running
# the cells top-to-bottom on a fresh kernel raises NameError on the next line.
output = rag_chain.invoke(question)
print(output)

An intrusion detection system is a security tool that continuously monitors industrial control systems for signs of unauthorized access or malicious activity. It provides alerts when potential threats are detected, allowing engineers to respond promptly and prevent financial damage. Intrusion detection systems use various techniques, including rule-based approaches and deep learning, to improve accuracy.


### RAG with Llama 3.2 (3B parameters)

In [83]:
from langchain import hub # uploading, browsing, pulling & managing prompts
from langchain_core.output_parsers import StrOutputParser # gives final output as string data
from langchain_core.runnables import RunnablePassthrough # pass question & context directly to the model
from langchain_core.prompts import ChatPromptTemplate # pass prompt, context & question

from langchain_ollama import ChatOllama # makes connection from your model to langchain

In [85]:
# Chat model client for 'llama3.2' at the given Ollama base URL.
model = ChatOllama(model="llama3.2", base_url="http://localhost:11434")

# Quick smoke test that the model responds.
model.invoke("oi!")

AIMessage(content="Oi back at ya! How's it going? Is there something I can help you with or would you like to chat?", additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2024-12-30T08:07:29.7845378Z', 'done': True, 'done_reason': 'stop', 'total_duration': 640117500, 'load_duration': 18687600, 'prompt_eval_count': 27, 'prompt_eval_duration': 23000000, 'eval_count': 26, 'eval_duration': 597000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-f1683899-db7a-41fc-b0e6-3229547b6bf5-0', usage_metadata={'input_tokens': 27, 'output_tokens': 26, 'total_tokens': 53})

In [86]:
# NOTE(review): `prompt` is reassigned to a hand-written template in the next cell, so this hub pull looks redundant — confirm and remove.
prompt = hub.pull("langchain-ai/rag-fusion-query-generation")

In [89]:
# Keep the raw template text under its own name so that `prompt` only ever
# refers to the ChatPromptTemplate object consumed by the chain (previously the
# same name held a plain string first and a template object afterwards).
prompt_template = """ 
You are an assistant for question-answering tasks. 
Use the following retrieved information to answer the questions. 
If you don't know, just state that you don't know. Use three sentences maximum, keeping the answer concise.
Question: {question}
Context: {context}
Answer:
"""

# Template with two placeholders: {question} and {context}.
prompt = ChatPromptTemplate.from_template(prompt_template)

In [90]:
def format_docs(docs):
    """Concatenate the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)
# Preview the context string built from the documents currently held in `docs`.
print(format_docs(docs))

Maroochy wastewater system attack, the Stuxnet worm attack, and the German steel plant
attack, have inflicted substantial financial damage on related facilities [3–5]. Recently, a
growing number of studies have been dedicated to developing intrusion detection systems
aimed at improving the security of industrial control systems [6–10]. Intrusion detection
of an attack, enabling engineers to respond promptly. The accuracy of intrusion detection
systems for securing industrial control systems has improved through the use of rule-based
approaches [2] and deep learning-based techniques [11].
Along with developing algorithms to improve the accuracy of intrusion detection
systems, research has also been conducted on methods for generating adversarial attacks
to evaluate the performance of intrusion detection systems [12–18]. Among the various

to evaluate the performance of intrusion detection systems [12–18]. Among the various
methods for adversarial attacks, this paper focuses on man-in-th

In [91]:
# Assemble the RAG pipeline: the incoming question goes to the retriever (whose
# results are joined by format_docs) and is also passed through unchanged; both
# values fill the prompt, which is sent to the model and parsed into a string.
context_builder = retriever | format_docs
rag_chain = (
    {"context": context_builder, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)