### 1. Load Text

In [1]:
doc_path = r"./data/hr_policy.txt"

# Read the whole policy document into memory as one string.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, so the same file could decode differently
# (or fail to decode) on another machine.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(doc_path, 'r', encoding='utf-8') as f:
    contents = f.read()

In [2]:
# Tokenizer shared by the length function defined below.
import tiktoken

tokenizer = tiktoken.get_encoding("p50k_base")


# create the length function
def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` is passed so that no special-token string is
    treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Quick check: show the token count of a short example sentence.
sample_text = (
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)
tiktoken_len(sample_text)

28

### 2. Create chunking function

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size and overlap are measured with tiktoken_len, i.e. in tokens.
CHUNK_SIZE_TOKENS = 400
CHUNK_OVERLAP_TOKENS = 20
# Separators are listed from coarsest (blank line) to finest (empty string).
SPLIT_SEPARATORS = ["\n\n", "\n", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    length_function=tiktoken_len,
    chunk_overlap=CHUNK_OVERLAP_TOKENS,
    chunk_size=CHUNK_SIZE_TOKENS,
)

# Split the loaded document and display the first chunk as a spot check.
chunks = text_splitter.split_text(contents)
chunks[0]

'HR POLICY MANUAL - LEAVE POLICY\nI. PURPOSE\nThis policy is designed to provide a clear and consistent understanding of the leave benefits provided by our company to its employees. It covers the rules and regulations regarding Vacation Leave, Sick Leave, and Service Incentive Leave.\nII. SCOPE\nThis policy applies to all regular full-time employees of the company, regardless of their position or department.\nIII. LEAVE POLICY\nA. Vacation Leave\n1.\tEligibility and Accrual: All regular full-time employees are eligible for Vacation Leave. Employees will earn 1.25 days of Vacation Leave per month of service, accruing to 15 days per year.\n2.\tApplication: Leave applications must be submitted through the Employee Self Service portal at least one day before the intended leave date. Approval from the immediate supervisor is required.\n3.\tUnused Leave: Unused Vacation Leave can be carried over to the next year. However, the total accumulated leave should not exceed 30 days. Any excess leav

### 3. Store Data chunks in Chroma DB

In [4]:
import chromadb

# Open the on-disk Chroma store located at ./vectorstore.
client = chromadb.PersistentClient(path="vectorstore")

# Fetch the "policies" collection, creating it if it is not there yet.
collection = client.get_or_create_collection(name="policies")

In [5]:
import uuid

# Populate the collection only while it is still empty, so re-running this
# cell against the persisted store does not insert the same chunks again.
# `chunks` is checked too, so add() is never called with an empty batch
# (the original per-chunk loop simply did nothing in that case).
if chunks and not collection.count():
    # One batched add() instead of one call per chunk: the documents are
    # handed over together rather than in N separate round-trips.
    # Every chunk gets its own random UUID as its id.
    collection.add(
        documents=chunks,
        ids=[str(uuid.uuid4()) for _ in chunks],
    )


### 4. Retrieve data

In [6]:
# Ask the collection for the two stored chunks closest to the question.
question = "Are employees on probation allowed to have vacation leaves?"
results = collection.query(query_texts=[question], n_results=2)
results

{'ids': [['2ead9b74-e0fa-4761-8c16-bfe63effe912',
   'da10132c-bb5f-474b-9793-41195e034221']],
 'distances': [[0.7031219871426202, 0.7481169960333809]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [["5.\tDuring Probation: Employees on probation are eligible for On-Call Duty.\nH. Travel Time\n1.\tEligibility and Accrual: All regular full-time employees who are required to travel for work are eligible for Travel Time. The duration will be equivalent to the duration of the travel.\n2.\tApplication: Applications must be submitted through the Employee Self Service portal at least one week before the intended travel date. Approval from the immediate supervisor is required.\n3.\tUnused Leave: There is no carryover or forfeiture for Travel Time as it is event-based.\n4.\tEncashment: Travel Time cannot be encashed.\n5.\tDuring Probation: Employees on probation are eligible for Travel Time.\nI. Emergency Duty\n1.\tEligibility and Accrual: All regular full-time employees who 