In [5]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Load pdfs from a directory
pdf_loader = PyPDFDirectoryLoader('Data')
pdfs = pdf_loader.load()
#pdfs

In [7]:
# Text Cleaning Function
for i, pdf in enumerate(pdfs):
    page = pdf.page_content
    page = re.sub(r'([\n]+)([0-9]+)', '', page)
    page = re.sub(r'([0-9]+) [.]', '', page)
    page = re.sub(r'([\n]+)', '', page)
    page = page.replace('•', '')
    page = re.sub(' +', ' ', page)
    page = page.replace("•", "")
    page = page.replace('"', "")
    if page[0].isdigit():
        page = page[1:]
    page = page.strip()
    pdfs[i].page_content = page
#pdfs

In [8]:
# Split the documents into smaller , managable chuncks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50
)
document_chunks = text_splitter.split_documents(pdfs)
print(f"The initial {len(pdfs)} documents were split into {len(document_chunks)} documents")

The initial 174 documents were split into 2605 documents


In [9]:
# import torch
# import transformers

# print(torch.__version__)  # Check torch version
# print(transformers.__version__)  # Check transformers version

In [12]:
from langchain_community.embeddings import OllamaEmbeddings
import pickle

# Initialize the embedding model
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Save the embedding model configuration
embedding_config = {
    "model": "nomic-embed-text",
}

# Save the configuration to a pickle file
with open('Data/embedding_config_nomic.pkl', 'wb') as f:
    pickle.dump(embedding_config, f)

print("Embedding configuration for nomic-embed-text saved.")


Embedding configuration for nomic-embed-text saved.


In [13]:
# from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# import pickle


# embeddings = HuggingFaceBgeEmbeddings(
#     model_name="BAAI/bge-small-en-v1.5",
#     model_kwargs={'device': 'cpu'},
#     encode_kwargs={'normalize_embeddings': True}
# )

# # Save the embedding model configuration separately (e.g., in a pickle file)
# embedding_config = {
#     "model_name": "BAAI/bge-small-en-v1.5",
#     "model_kwargs": {'device': 'cpu'},
#     "encode_kwargs": {'normalize_embeddings': True}
# }
# with open('Data/embedding_config.pkl', 'wb') as f:
#     pickle.dump(embedding_config, f)

# print("FAISS index and embedding configuration saved.")

In [14]:
# Save the embeddings into Faiss index
from langchain_community.vectorstores import FAISS
db = FAISS.from_documents(document_chunks, embeddings)

db.index.ntotal 

2605

In [15]:
# Save the db to disk
db.save_local('Data/faiss_index')

In [16]:
# Load the db from disk
db = FAISS.load_local(
    folder_path='Data/faiss_index',
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)
db.index.ntotal

2605

In [17]:
# query = "*"How do I start the powerstation?"™"
query = "do i get travel allownance?"
relevant_documents = db.similarity_search(query)

# for i in relevant_documents:
#     print(i)

In [18]:
# Use similarity searching algorithm and return 3 most relevant chunks.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 3})
#retriever

In [19]:
# Setup an LLM for text generation 
# import ollama
# from langchain_ollama import OllamaLLM
from langchain_community.llms import Ollama
from langchain.chains.llm import LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
# from langchain_core.prompts import PromptTemplate

In [23]:
# Create a prompt template
prompt_template = """
You are a friendly senior Human Resource(HR) personnel. 
You will be given a question from an employee regarding their queries related to HR-Policies. 
Your task is to understand the employee's question first thoroughly, then based on the context provided to you, 
answer the employee in 3-4 concise sentences. 
If you don't know the answer to the employee's question, say "I don't know the answer to your question, 
Please contact the focal HR personnel in your department."

Context: {context}
Question: {question}
Your Helpful Answer:
"""

In [24]:
# Create a chat prompt template
chat_prompt = ChatPromptTemplate.from_template(Prompt_template)


In [25]:
# Choose the ollama model
model = Ollama(model="llama3.2")


In [26]:
# Create retrieval QA chain
retrievalQA = RetrievalQA. from_chain_type(
    llm=model,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": chat_prompt})

In [33]:
# Test the retrieval QA chain
user_query = "as an AVP executive what is my purchase price  entitlementfor vehicles for markup free loan?"
answer = retrievalQA. invoke({"query": user_query})

In [34]:
# Print the answer
print(answer['query'])
print(answer['result'])

pages = []
for i in answer['source_documents']:
    pages.append(i.metadata['page'])
pages.sort()
print("Further information in HR Manual: {}".format(pages))


as an AVP executive what is my purchase price  entitlementfor vehicles for markup free loan?
As an AVP Executive, your purchase price entitlement for a markup-free car loan would be fixed in January each year and listed under the "Grade Vehicle Entitlement" list. I don't know the exact figures, but it's mentioned that this information will be provided annually. You can check with our HR department or finance team to get the most up-to-date information on your entitlement. They should be able to provide you with more details on what vehicles are included in the list and how much of a loan is available.
Further information in HR Manual: [35, 35, 65]
