### Using Recursive Character Splitter

In [13]:
import tiktoken
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI


# Model name shared by the notebook: it selects the tiktoken encoding in the
# token-counting helpers and is the chat model passed to ChatOpenAI for the QA chain.
model_default = "gpt-4o"

def count_tokens(text):
    """Return the number of tokens in ``text`` using the encoding for ``model_default``."""
    token_ids = tiktoken.encoding_for_model(model_default).encode(text)
    return len(token_ids)

def count_tokens_from_message_rough(messages):
    """Roughly estimate the token count of a list of chat messages.

    The ``content`` values of all messages are joined with single spaces and the
    combined string is encoded with the encoding for ``model_default``. Nothing
    else from a message (e.g. its role) is counted, hence "rough".

    Args:
        messages: Iterable of dict-like messages. A message without a ``content``
            key, or whose content is ``None``, is counted as empty text instead
            of making ``str.join`` raise ``TypeError``.

    Returns:
        int: Number of tokens in the joined content.
    """
    encoding = tiktoken.encoding_for_model(model_default)
    # `or ''` covers both a missing key and an explicit None content.
    joined = ' '.join(message.get('content') or '' for message in messages)
    return len(encoding.encode(joined))

# Load the Word document into LangChain Document objects.
loader = Docx2txtLoader("./data/DRAFT 1_Maternity Leave Information.docx")
docs = loader.load()

# chunk_size and chunk_overlap are measured in tokens, not characters, because
# length_function is count_tokens.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    length_function=count_tokens
)

# Chunks produced here are what gets embedded and indexed into the vector store.
splitted_documents = text_splitter.split_documents(docs)

# Embedding model used to index the chunks in the vector store.
embeddings_model = OpenAIEmbeddings(model='text-embedding-3-small')

# The store is persisted to ./chroma_db, so it survives kernel restarts. Without
# explicit ids, every run of this cell inserts the same chunks again under newly
# generated ids and the retriever starts returning duplicate documents.
# Position-based ids turn the insert into an upsert, keeping re-runs idempotent.
# NOTE(review): duplicates written by earlier runs are not cleaned up here —
# delete ./chroma_db once if the store was already populated.
chunk_ids = [f"recursive-chunk-{i}" for i in range(len(splitted_documents))]
db = Chroma.from_documents(
    splitted_documents,
    embeddings_model,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

# Retrieval-augmented QA: fetch the 2 most similar chunks and return them
# alongside the answer so the sources can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model=model_default),
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
)

response = qa_chain.invoke("Am I eligible for maternity leave if I am not a citizen?")
response


{'query': 'Am I eligible for maternity leave if I am not a citizen?',
 'result': "The provided context does not specify eligibility criteria for maternity leave for non-citizens. It mainly addresses the conditions and procedures for Singapore Citizen children and their mothers. To determine your eligibility, it would be best to consult your HR Business Partner or refer to your organization's specific maternity leave policy.",
 'source_documents': [Document(page_content='Officers whose NPL end after the first 8 weeks from the date of the birth of the child would be granted the remainder of the 4 or 8 weeks of EML, to be taken within 12 months from the date of the birth of the child. Any remaining EML not taken within this period will be forfeited.\n\n\n\n\n\nProcedure\n\nOfficers are to inform their HR Business Partner of their EDD and the planned start date of your ML (can be up to 4 weeks before EDD or upon delivery, please see diagram below).\n\nSubmit to HR Business Partner their Do

In [23]:
# print(splitted_documents[0].page_content)

### Using SemanticChunker

In [16]:
# Display the loaded Document list to inspect the raw text before semantic chunking.
docs

[Document(page_content='\u200b\u200b\u200bMaternity Leave is granted to female officers to look after their new-born child(ren).\n\n\n\nFor this section, "confinement" means the delivery of a child, and "the organisation" denotes Enterprise Singapore (IE and SPRING before 1 April 2018).\n\n\n\n1.  All female officers (single/married) will be eligible for Maternity Leave (ML), if:\n\nShe has been working with the organisation for a continuous duration of at least 3 months prior to the date of confinement.\n\nShe is still in service*; and\n\nThe child is born on/after 1 January 2017\n\n\n\n* An officer who is on maternity leave may tender her resignation and serve out her notice of resignation concurrently with maternity leave. The maternity leave will cease after the last day of service. \n\n\n\n Eligibility \n\nIf the child is a Singapore Citizen at the time of birth:\n\n The female officer will be eligible for 16 weeks of full pay ML (first 8 weeks must be taken consecutively) for all

In [22]:
from langchain_experimental.text_splitter import SemanticChunker  
from langchain_openai.embeddings import OpenAIEmbeddings

# SemanticChunker is constructed from just an embeddings model.
# NOTE(review): OpenAIEmbeddings() is created without a model argument here, whereas
# embeddings_model in the earlier cell pins 'text-embedding-3-small' — confirm intended.
text_splitter = SemanticChunker(OpenAIEmbeddings())

# Split text. This rebinds text_splitter and splitted_documents, replacing the
# RecursiveCharacterTextSplitter results; the new chunks are not added to `db` here.
splitted_documents = text_splitter.split_documents(docs)  
# print(splitted_documents[0].page_content)

In [25]:
# This is the Core Part of the Code
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# NOTE(review): `question` is reassigned in a later cell before the retriever is
# invoked, so this value appears to be unused — confirm and remove.
question = "What are the approaches to Task Decomposition?"
llm = ChatOpenAI(temperature=0)
# Wrap the vector-store retriever so the LLM can rephrase the query.
# NOTE(review): `db` was built from the RecursiveCharacterTextSplitter chunks, not
# the SemanticChunker output — confirm that is the intended index for this section.
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=db.as_retriever(), llm=llm
)

In [27]:
# Set logging for the queries
import logging

# Each LangChain class/module logs under its own logger name; consult the
# LangChain documentation for the name that matches the component in use.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
# INFO level is what makes the generated queries appear in the cell output.
multi_query_logger.setLevel(logging.INFO)

In [29]:
question = "Why do LLMs have problem with mathematical calculations?"

# The query variants generated by the retriever are logged at INFO level and
# appear in this cell's output ahead of the returned documents.
unique_docs = retriever_from_llm.invoke(question)
unique_docs

INFO:langchain.retrievers.multi_query:Generated queries: ['What are the challenges faced by LLMs when it comes to performing mathematical calculations?', 'What factors contribute to the difficulties LLMs encounter when solving math problems?', 'How do LLMs struggle with mathematical computations and what are the underlying reasons for this?']


[Document(page_content="If the above criteria is met within the first 8 weeks of confinement, the female officer will be eligible for the remainder of the first 8 weeks (to be taken continuously) and another 8 weeks of paid ML.\n\nIf the above criteria is met after 8 weeks of the confinement, the officer may be granted up to 8 weeks of full pay ML, which may be taken flexibly over a 12-month period from the birth of the child, subject to mutual agreement between the Reporting Officer and the officer.\u200b\n\nThere will be no retrospective payment for any ML (last 4 weeks of no pay ML) taken before the child obtains Singapore Citizenship.  \n\n\n\n \n\n\n\nNot Eligible for Maternity Leave (ML)\n\nA female officer is not eligible for paid ML, if:\n\nHer date of appointment is at least 3 months preceding the date of confinement,\n\nThe child is not a Singapore Citizen at the time of birth and is her third or subsequent confinements (excluding legally adopted children or step children)\u2