## Personal Laptop Assistant

### A question answering agent that is an expert knowledge worker
### To be used by persons who have saved lot of documents on their
### laptops and find it difficult to fetch information from them 
### with minimum effort and time
### The agent needs to be accurate and the solution should be low cost.

This project will use RAG (Retrieval Augmented Generation) to ensure our question/answering assistant has high accuracy.

In [68]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [69]:
# imports for langchain, plotly and Chroma

#from langchain.document_loaders import DirectoryLoader, TextLoader
#from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
#from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [70]:
import traceback
from langchain.document_loaders import (
    DirectoryLoader,
    TextLoader,
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [71]:
# price is a factor for our company, so we're going to use a low cost model

MODEL = "gpt-4o-mini"
db_name = "vector_db"

In [72]:
# Load environment variables in a file called .env

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [73]:
SUPPORTED_EXTENSIONS = {
    '.txt': TextLoader,
    '.md': TextLoader,
    '.pdf': PyPDFLoader,
    '.docx': UnstructuredWordDocumentLoader,
}

In [74]:
def load_documents_from_folder(folder_path):
    documents = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            ext = os.path.splitext(file)[1].lower()
            loader_class = SUPPORTED_EXTENSIONS.get(ext)

            if not loader_class:
                print(f"Skipping unsupported file: {file_path}")
                continue

            try:
                print(f"Loading: {file_path}")
                loader = loader_class(file_path)
                docs = loader.load()
                documents.extend(docs)
            except Exception as e:
                print(f"Failed to load {file_path}: {e}")
                traceback.print_exc()
    return documents

In [75]:
if __name__ == "__main__":
    folder_path = "C:\\Users\\mangu\\Documents\\Project\\llm_engineering\\week5\\knowledge-base"  # <- Replace this with your folder path
    try:
        print(f"\n📁 Scanning folder: {folder_path}")
        documents = load_documents_from_folder(folder_path)

        if not documents:
            print("No documents found. Exiting.")
            exit

        print("\n✂️ Splitting documents into chunks...")
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(documents)

        print("\n🧠 Creating embeddings...")
        embeddings = OpenAIEmbeddings()

        # Delete if already exists
        if os.path.exists(db_name):
            Chroma(persist_directory="chroma_db", embedding_function=embeddings).delete_collection()
    
        print("\n💾 Storing embeddings in Chroma DB...")
        vectordb = Chroma.from_documents(documents, embedding=embeddings, persist_directory="chroma_db")
        vectordb.persist()

        print(f"\n✅ Embeddings successfully saved to Chroma at: chroma_db")
        print(f"Vectorstore created with {vectordb._collection.count()} documents")

        # Let's investigate the vectors

        collection = vectordb._collection
        count = collection.count()
        
        sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
        dimensions = len(sample_embedding)
        print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

    except Exception as e:
        print(f"\n❌ Error during embedding creation: {e}")
        traceback.print_exc()


📁 Scanning folder: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\company\about.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\company\careers.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\company\overview.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\company\.ipynb_checkpoints\careers-checkpoint.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\company\.ipynb_checkpoints\overview-checkpoint.md
Skipping unsupported file: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\contracts\.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\contracts\Contract with Apex Reinsurance for Rellm.md
Failed to load C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\contracts\Contract with Apex Re

Traceback (most recent call last):
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
    text = f.read()
           ^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 156: character maps to <undefined>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mangu\AppData\Local\Temp\ipykernel_16424\754545630.py", line 16, in load_documents_from_folder
    docs = loader.load()
           ^^^^^^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_core\document_loaders\base.py", line 32, in load
    return list(self.lazy_load())
           ^^^^^^^^^^^^^^^^^^^^^^
  File "

Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\contracts\important-documents\Trupti Manguesh Borker.pdf
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Alex Chen.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Alex Harper.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Alex Thomson.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Avery Lancaster.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\eBook-Scaling-RAG-Systems-from-POC-to-Production-–-2025.pdf
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Emily Carter.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Emily Tran.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees

Traceback (most recent call last):
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
    text = f.read()
           ^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1305: character maps to <undefined>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mangu\AppData\Local\Temp\ipykernel_16424\754545630.py", line 16, in load_documents_from_folder
    docs = loader.load()
           ^^^^^^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_core\document_loaders\base.py", line 32, in load
    return list(self.lazy_load())
           ^^^^^^^^^^^^^^^^^^^^^^
  File 

Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Maxine Thompson.md
Failed to load C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Maxine Thompson.md: Error loading C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Maxine Thompson.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Oliver Spencer.md
Failed to load C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Oliver Spencer.md: Error loading C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Oliver Spencer.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Samantha Greene.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\Samuel Trenton.md
Loading: C:\Users\mangu\Documents\Project\llm_engineering\week5\knowledge-base\employees\.ipynb_checkpoints\Jordan K

Traceback (most recent call last):
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
    text = f.read()
           ^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 3419: character maps to <undefined>

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\mangu\AppData\Local\Temp\ipykernel_16424\754545630.py", line 16, in load_documents_from_folder
    docs = loader.load()
           ^^^^^^^^^^^^^
  File "C:\Users\mangu\anaconda3\envs\llms\Lib\site-packages\langchain_core\document_loaders\base.py", line 32, in load
    return list(self.lazy_load())
           ^^^^^^^^^^^^^^^^^^^^^^
  File 


✅ Embeddings successfully saved to Chroma at: chroma_db
Vectorstore created with 3144 documents
There are 3,144 vectors with 1,536 dimensions in the vector store


In [76]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vectordb.as_retriever(search_kwargs={"k": 50})

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

In [77]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    print("\nAnswer:", result["answer"])
    return result["answer"]

In [78]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7901

To create a public link, set `share=True` in `launch()`.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Trupti Manguesh Borker 
Phone: +91-7798884528 
E-mail : truptimborker@gmail.com 
Result driven professional with MBA in finance & Marketing with over 17 + years of experience diversified into 
Portfolio/Program/project governance, Project management office (PMO), managing IT projects as a PM & scrum 
master and in Finance & Accounts domain. 
Key Skills 
• Rich experience of working on Programs/Projects in banking and finance as a Portfolio Governance / 
PMO / PM / SM role. 
• Proficiently adept at implementing various project methodologies such as Agile/Scrum, Waterfall, and 
Kanban, while utilizing essential 

# Exercises

Try applying this to your own folder of data, so that you create a personal knowledge worker, an expert on your own information!