In [81]:
import os
from dotenv import load_dotenv
load_dotenv()

# os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")


Step 1: Load documents from PDF file


In [82]:
pdf_directory='./docs'

documents = []

file_paths = [os.path.join(pdf_directory,file) for file in os.listdir(pdf_directory) if file.endswith('.md')]

from langchain.document_loaders import UnstructuredMarkdownLoader

for file_path in file_paths:
    loader = UnstructuredMarkdownLoader(file_path)
    documents.extend(loader.load())


In [83]:
documents

[Document(metadata={'source': './docs\\personal_health_log.md'}, page_content='🧠 Personal Health Log – April 2025\n\n🌡️ Symptoms Tracker\n\nDate Issue Severity Notes Apr 10 Migraine 7/10 Triggered after poor sleep Apr 14 Sore throat 5/10 Resolved with warm liquids Apr 17 Lower back pain 6/10 Possibly from long sitting\n\n🏋️ Exercise Log\n\nDate Activity Duration Notes Apr 12 Upper body workout 45 min Good form, added shoulder press Apr 13 Muay Thai basics 30 min Focused on jab-cross combos Apr 16 Lower body (light) 40 min Split squats, glute bridges\n\n🍽️ Diet Notes\n\nIncreased protein intake (100g/day avg)\n\nMore veggies, started green smoothies\n\nCut back on late-night snacking\n\n🧘 Mental Health\n\nJournaling 3x/week\n\nUsing Insight Timer for 10-min meditation\n\nSleep avg: 6.5–7 hrs\n\n🩺 Upcoming Appointments\n\nGeneral Check-up – May 3, 2025\n\nDentist – May 12, 2025'),
 Document(metadata={'source': './docs\\resume.md'}, page_content='Naheel KK\n\nFull-Stack Developer (MERN) |

In [84]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 100
)
text_splitted_document = text_splitter.split_documents(documents)

In [85]:
text_splitted_document

[Document(metadata={'source': './docs\\personal_health_log.md'}, page_content='🧠 Personal Health Log – April 2025\n\n🌡️ Symptoms Tracker\n\nDate Issue Severity Notes Apr 10 Migraine 7/10 Triggered after poor sleep Apr 14 Sore throat 5/10 Resolved with warm liquids Apr 17 Lower back pain 6/10 Possibly from long sitting\n\n🏋️ Exercise Log'),
 Document(metadata={'source': './docs\\personal_health_log.md'}, page_content='🏋️ Exercise Log\n\nDate Activity Duration Notes Apr 12 Upper body workout 45 min Good form, added shoulder press Apr 13 Muay Thai basics 30 min Focused on jab-cross combos Apr 16 Lower body (light) 40 min Split squats, glute bridges\n\n🍽️ Diet Notes\n\nIncreased protein intake (100g/day avg)'),
 Document(metadata={'source': './docs\\personal_health_log.md'}, page_content='🍽️ Diet Notes\n\nIncreased protein intake (100g/day avg)\n\nMore veggies, started green smoothies\n\nCut back on late-night snacking\n\n🧘 Mental Health\n\nJournaling 3x/week\n\nUsing Insight Timer for 10-

In [86]:
from langchain_community.vectorstores import FAISS
# from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(text_splitted_document, embedding)

# embeddings = OpenAIEmbeddings()
# vectorstore = FAISS.from_documents(text_splitted_document,embeddings)

In [87]:
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x1f868cd18e0>

In [88]:
query = "When was i suffered Migraine"
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k":3}
)
result = retriever.invoke(query)
print(len(result))
result[0].page_content

3


'🧠 Personal Health Log – April 2025\n\n🌡️ Symptoms Tracker\n\nDate Issue Severity Notes Apr 10 Migraine 7/10 Triggered after poor sleep Apr 14 Sore throat 5/10 Resolved with warm liquids Apr 17 Lower back pain 6/10 Possibly from long sitting\n\n🏋️ Exercise Log'

In [89]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load a Hugging Face pipeline with the right task
flan_pipeline = pipeline("text2text-generation", model="google/flan-t5-large")

# Wrap it in LangChain's LLM class
llm = HuggingFacePipeline(pipeline=flan_pipeline)
# llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu
  llm = HuggingFacePipeline(pipeline=flan_pipeline)


In [93]:

query= "Which cities i'm visiting in Japan"
result = qa_chain.invoke({"query": query})
print("\nQuestion: ",query)
print("\nAnswer:",result["result"])



Question:  Which cities i'm visiting in Japan

Answer: Tokyo


In [94]:
from langchain.prompts import PromptTemplate
template = """
You are a helpful assistant that answers questions based on provided documents,

Context information from documents:
{context}
Question: {question}

Answer the question based on the provided documents. 
"""

prompt=PromptTemplate(
    template=template,
    input_variables=["context","question"]
)

In [None]:
flan_pipeline = pipeline("text2text-generation", model="google/flan-t5-large")

# Wrap it in LangChain's LLM class
llm = HuggingFacePipeline(pipeline=flan_pipeline)
# llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt":prompt}
)

In [97]:
query = "What's my interests"
result= qa_chain.invoke({"query":query})
print("Question: ",query)
print("\nAnswer: ",result["result"])

Question:  What's my interests

Answer:  Muay Thai, AI/ML tools, Obsidian note-taking, productivity hack
