In [23]:
import nest_asyncio
from llama_parse import LlamaParse
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain 
from langchain_core.prompts import ChatPromptTemplate
import os

# Allow nested use of the asyncio event loop: the notebook kernel already runs
# one, and the parser's synchronous helpers need to run inside it.
nest_asyncio.apply()

# Read the LlamaCloud key from the environment instead of committing it to the
# notebook. The key that used to be hard-coded here has been exposed and should
# be revoked.
llama_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
if not llama_api_key:
    raise RuntimeError(
        "Set the LLAMA_CLOUD_API_KEY environment variable before running this cell."
    )

parser = LlamaParse(
    api_key=llama_api_key,
    result_type="text",
    verbose=True,
    language="en",
)

relative_path = "documents"
current_working_directory = os.getcwd()
directory = os.path.join(current_working_directory, relative_path)

# Full path of every regular file in the directory. Sorted so the parse order
# (and the index-based chunk ids derived from it later) is the same on every
# run; sub-directories are skipped because they cannot be parsed as files.
files = sorted(
    path
    for path in (os.path.join(directory, name) for name in os.listdir(directory))
    if os.path.isfile(path)
)

# One parse result (a list of parsed items) per input file, in `files` order.
documents = [parser.load_data(file_path=file) for file in files]

# Show the first parsed item of each file; a file that produced nothing is
# reported instead of raising IndexError.
for file, doc in zip(files, documents):
    if doc:
        print(doc[0].text)
    else:
        print(f"No content parsed from {file}")


Started parsing the file under job_id bc580c36-86d5-43f9-8bd8-40b6ee4f7322
Started parsing the file under job_id 16087b28-931b-4365-8bd1-b1fd86eb8cfc
Started parsing the file under job_id d0537435-1be6-48ff-ab92-1955d83974bc
Started parsing the file under job_id 0bf63e2f-38cb-4883-b684-0cd06068a535
Started parsing the file under job_id 370771bf-2b06-4a0c-851f-594015047310
Title: Weather tomorrow - New York, NY
Content:
Title: Tomorrow's Weather in New York - Hourly Forecast and Conditions
Content: The weather tomorrow in New York will be much hotter than today, with
temperatures raising to 24°. The rain will visit again tomorrow in New York and
the expected precipitation are 11 mm. The wind tomorrow will get lighter
comparing to today but still be strong and reach over 24 km/h. The weather in
New York tomorrow is expected to be much warmer than usual, with a forecast
temperature  of 24 °F, compared to an average of 16.8 °F for April 22nd in recent
years. Get accurate weather forecasts 

In [25]:
# Wrap every parsed item in a LangChain Document. `documents` was built from
# `files` in the same order, so zipping them recovers which file each item came
# from. The metadata records that file name plus the item's position within the
# file; a bare per-file counter would label the first item of every file
# identically and lose the provenance.
partitions = []
for file, doc in zip(files, documents):
    for i, data in enumerate(doc):
        print(f"Data {i}:")
        print(data.text)
        partitions.append(
            Document(
                page_content=data.text,
                metadata={"source": os.path.basename(file), "part": i},
            )
        )

# Split into chunks of at most 1000 characters with 100 characters of overlap,
# preferring paragraph breaks, then line breaks, then spaces as split points.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)

chunks = text_splitter.split_documents(partitions)

Data 0:
Title: Weather tomorrow - New York, NY
Content:
Data 0:
Title: Tomorrow's Weather in New York - Hourly Forecast and Conditions
Content: The weather tomorrow in New York will be much hotter than today, with
temperatures raising to 24°. The rain will visit again tomorrow in New York and
the expected precipitation are 11 mm. The wind tomorrow will get lighter
comparing to today but still be strong and reach over 24 km/h. The weather in
New York tomorrow is expected to be much warmer than usual, with a forecast
temperature  of 24 °F, compared to an average of 16.8 °F for April 22nd in recent
years. Get accurate weather forecasts for New York, located at latitude 40.714
and longitude -74.006. Stay updated with localized weather information tailored
for New York. Subscribe to receive real-time weather notifications for New York
Remember to  enable notifications in your browser settings to stay informed
(usually at  the top left corner).
Data 0:
Title: Access Denied
Content:
Reference

In [26]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# On-disk location of the Chroma database used by this notebook.
persist_directory = "./c_db"

# Open (or create) the persisted collection. A failure here is allowed to
# propagate: the previous handler called a method on `vectorstore`, which is
# unbound when the constructor itself raises, so it could only mask the real
# error with a second one.
vectorstore = Chroma(
    persist_directory=persist_directory,
    collection_name="document_chunks",
    embedding_function=embedding_model,
)

# Ids are derived from the chunk index, so re-running this cell reuses the same
# ids rather than generating new ones.
# NOTE(review): if a later run yields fewer chunks, higher-numbered entries from
# an earlier run presumably stay in the collection -- confirm and clear the
# collection first if that matters.
chunk_ids = vectorstore.add_documents(
    chunks, ids=[f"chunk_{i}" for i in range(len(chunks))]
)

# Flush to disk only after the documents have been added; persisting before the
# add writes nothing useful.
# NOTE(review): recent chromadb releases reportedly persist automatically, which
# would make this call redundant -- confirm for the installed version.
vectorstore.persist()

# Display the ids that were written as the cell's output.
chunk_ids

['chunk_0', 'chunk_1', 'chunk_2', 'chunk_3', 'chunk_4']

In [37]:
# Re-open the collection stored under `persist_directory`, using the same
# embedding model that was used when the chunks were written.
vectorstore = Chroma(
    collection_name="document_chunks",
    embedding_function=embedding_model,
    persist_directory=persist_directory,
)

# Expose the store as a LangChain retriever that returns the 5 closest chunks.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

In [35]:
query = "What is the weather in New York tomorrow?"

# Sanity-check retrieval on its own: ask the store for the 3 closest chunks.
similar_docs = vectorstore.similarity_search(query, k=3)
print(len(similar_docs))

# Print each hit under a 1-based result header.
for rank, hit in enumerate(similar_docs, start=1):
    header = f"\n--- Result {rank} ---\n"
    print(header)
    print(hit.page_content)

3

--- Result 1 ---

Title: Tomorrow's Weather in New York - Hourly Forecast and Conditions
Content: The weather tomorrow in New York will be much hotter than today, with
temperatures raising to 24°. The rain will visit again tomorrow in New York and
the expected precipitation are 11 mm. The wind tomorrow will get lighter
comparing to today but still be strong and reach over 24 km/h. The weather in
New York tomorrow is expected to be much warmer than usual, with a forecast
temperature  of 24 °F, compared to an average of 16.8 °F for April 22nd in recent
years. Get accurate weather forecasts for New York, located at latitude 40.714
and longitude -74.006. Stay updated with localized weather information tailored
for New York. Subscribe to receive real-time weather notifications for New York
Remember to  enable notifications in your browser settings to stay informed
(usually at  the top left corner).

--- Result 2 ---

Title: Weather tomorrow - New York, NY
Content:

--- Result 3 ---

Titl

In [38]:
# Local Ollama chat model; a low temperature keeps the answers close to the
# retrieved context.
llm = ChatOllama(
    model="llama3",
    temperature=0.2,
)

# System prompt for the answering step. `{context}` is the placeholder that
# receives the retrieved documents. The stray double quotes that a previous
# string-concatenation form left inside the text have been removed so they are
# no longer sent to the model.
system_prompt = """Answer the question based on the provided guidelines and given context.
**Guidelines:**
- For numerical answers (e.g., metrics, scores), provide only the value.
- For yes/no questions, answer strictly with "Yes" or "No".
- If no relevant context exists, state "NA".

Context: {context}"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Combine the retrieved documents into the prompt, then put the retriever in
# front so a single invoke() performs both retrieval and answering.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

query = "What is UniAD Backbone?"
response = chain.invoke({"input": query})

print(response.get('answer'))

NA
