In [42]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader

from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import FastEmbedEmbeddings
from langchain.docstore.document import Document
import sys


In [43]:
def ingest():
    # Load the Excel file
    file_path = r"Mutti_Ullah MaxRemind - IET SIT-1 Log Book.xlsx"
    df = pd.read_excel(file_path)  # Adjust `read_excel` parameters as needed
    
    # Combine data into a single string (or process row by row if needed)
    content = "\n".join(df.astype(str).apply(lambda x: " ".join(x), axis=1))
    
    # Create a document-like structure
    documents = [Document(page_content=content)]
    
    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} document(s) into {len(chunks)} chunks.")
    
    # Embed and create a vector store
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(documents=chunks, embedding=embedding, persist_directory="./sql_chroma_db")

In [44]:
ingest()

Split 1 document(s) into 11 chunks.



etching 5 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]

In [45]:
from huggingface_hub import login
access_token_read = "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"
access_token_write = "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"
login(token = access_token_read)

In [46]:
def rag_chain():
    model = ChatOllama(model="llama3.2")
    #
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
         {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )
    #Load vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 3,
            "score_threshold": 0.1,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)
    #
    return chain

In [47]:
print("Test")

Test


In [48]:
def ask(query: str):
    #
    chain = rag_chain()
    # invoke chain
    result = chain.invoke({"input": query})
    # print results
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])

In [40]:
ask("How do we check null values?")


etching 5 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]

Unfortunately, the provided context does not mention how to check null values. The text appears to be a task log from a training program for React Native development, with no relevant information about checking null values.
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf


KeyError: 'source'

In [49]:
ask("tasks assigned on October 11")


etching 5 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]

The task assigned on October 11 was a Leave.
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf


In [24]:
ask("list all practice names")


etching 5 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5003.94it/s]

The practice names mentioned in the context are:

1. Backend Debugging
2. Data Validation
3. QA Testing
4. Excel Formatting 

Note that there are also project-specific names mentioned, such as "Django API for Excel Formatting" and "OCR API Quality Assurance", but these seem to be more specific to projects rather than general practice names.
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf
Source:  E:\_Github Projects\realtime-chat\api\Mutti Ullah_IET SIT-1 Training Logbook.pdf


In [16]:
ask("list all practice ids")


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1
███████████████████████████| 5/5 [00:00<?, ?it/s]

Here is the list of practice IDs:

1. 1012
2. 1017
3. 1018
4. 1021
5. 1024
6. 1025
7. 1027
8. 1069
9. 1086
10. 1093
11. 1099
12. 1101
13. 1106
14. 1120
15. 1133
16. 1143
Source:  E:\_Github Projects\realtime-chat\api\Practice ID.pdf


In [17]:
ask("is any practice id or practice name repeating?")


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1
███████████████████████████| 5/5 [00:00<?, ?it/s]

No Context available for this question is any practice id or practice name repeating?
Source:  E:\_Github Projects\realtime-chat\api\Practice ID.pdf
