In [3]:
import os
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, OpenAIEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


In [4]:
loader = PyPDFDirectoryLoader("pdfs")
docs_before_split = loader.load()



In [5]:
from collections import defaultdict
from langchain.document_loaders import PyPDFDirectoryLoader


max_chars_per_pdf = defaultdict(int)

for doc in docs_before_split:
    # Ensure metadata keys exist
    if "source" not in doc.metadata:
        pdf_path = doc.metadata.get("file_path", "unknown.pdf")
        doc.metadata["source"] = os.path.basename(pdf_path)
    if "page" not in doc.metadata:
        doc.metadata["page"] = doc.metadata.get("page_number", -1)

    # Update max chars per PDF
    source = doc.metadata["source"]
    length = len(doc.page_content)
    if length > max_chars_per_pdf[source]:
        max_chars_per_pdf[source] = length

# Print results
for filename, max_len in max_chars_per_pdf.items():
    print(f"{filename}: max chars in a page = {max_len}")


pdfs/DeepLearningCourseSlides.pdf: max chars in a page = 762
pdfs/MLBasics-Unsupervised.pdf: max chars in a page = 724
pdfs/IntroToComputerVision.pdf: max chars in a page = 1343
pdfs/AI101.pdf: max chars in a page = 587
pdfs/MachineLearning-supervised.pdf: max chars in a page = 1315
pdfs/IntroToLLMs.pdf: max chars in a page = 1506


In [6]:
text_splitter =  RecursiveCharacterTextSplitter(
    chunk_size =300,
    chunk_overlap = 50
)
docs_after_split = text_splitter.split_documents(docs_before_split)


In [7]:
docs_after_split[0]

Document(metadata={'producer': 'macOS Version 15.1 (Build 24B2082) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20250605093129Z00'00'", 'title': 'DeepLearningCourseSlides', 'moddate': "D:20250605093129Z00'00'", 'source': 'pdfs/DeepLearningCourseSlides.pdf', 'total_pages': 84, 'page': 0, 'page_label': '1'}, page_content='Lara WEHBE - August 2024 - TheAIEngineers\nDeep Learning CourseIntermediate Level')

In [8]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)

In [9]:
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

In [10]:
print(f'before split: {avg_char_before_split}')
print(f'after split: {avg_char_after_split}')

before split: 207
after split: 158


In [None]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name= "sentence-transformers/all-MiniLM-L6-v2",
   #model_kwargs = {'device' : 'cpu'},
    encode_kwargs = {'normalize_embeddings' : True}
)

In [12]:

vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [38]:
query = "Explain what Artificial Intelligence in simple terms"


In [39]:
relevant_documents = vectorstore.similarity_search(query)

In [40]:
print(relevant_documents)

[Document(id='7d324d25-5973-48e0-a1f6-081e356dff3d', metadata={'producer': 'macOS Version 15.1 (Build 24B2082) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20250605093059Z00'00'", 'title': 'AI101', 'moddate': "D:20250605093059Z00'00'", 'source': 'pdfs/AI101.pdf', 'total_pages': 22, 'page': 1, 'page_label': '2'}, page_content='What is Artificial Intelligence?'), Document(id='44978063-e979-47ce-883e-79d0fc3af822', metadata={'producer': 'macOS Version 15.1 (Build 24B2082) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20250605093245Z00'00'", 'title': 'MachineLearningBasics', 'moddate': "D:20250605093245Z00'00'", 'source': 'pdfs/MachineLearning-supervised.pdf', 'total_pages': 84, 'page': 8, 'page_label': '9'}, page_content='What is Artificial Intelligence (AI)? \nAI Deﬁnition\nIt gives the machines the ability to Mimic human behavior.\nArtiﬁcial Intelligence (AI), a term coined by\nemeritus Stanford Professor John \nMcCarthy in 1955,\nwas deﬁned by him as “the

In [41]:
# Perform similarity search
relevant_documents = vectorstore.similarity_search(query)

# Extract only the text content
answers = [doc.page_content for doc in relevant_documents]

# Print just the answers
for i, answer in enumerate(answers):
    print(f"Answer {i+1}:\n{answer}\n{'-'*50}")


Answer 1:
What is Artificial Intelligence?
--------------------------------------------------
Answer 2:
What is Artificial Intelligence (AI)? 
AI Deﬁnition
It gives the machines the ability to Mimic human behavior.
Artiﬁcial Intelligence (AI), a term coined by
emeritus Stanford Professor John 
McCarthy in 1955,
was deﬁned by him as “the science and 
engineering of
making intelligent machines”
--------------------------------------------------
Answer 3:
Artificial Intelligence or Machine Learning?
What’s the diﬀerence?
The terms Artiﬁcial 
Intelligence (AI) and machine 
learning (ML) are often used 
interchangeably, but they are 
not the same.
--------------------------------------------------
Answer 4:
What is an AI Engineer?
And how to become one?
Artiﬁcial intelligence engineers are individuals who use AI and 
machine learning techniques to develop applications and 
systems that can help organizations increase eﬃciency, cut 
costs, increase proﬁts, and make better business decisions.

In [42]:
retriever = vectorstore.as_retriever(search_type="similarity" , search_kwargs={"k" : 3})

In [43]:
from langchain.llms import HuggingFacePipeline

local_llm = HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-large",
    task="text2text-generation",

    model_kwargs={
        "temperature": 0.2,
        "do_sample": True
    },
    pipeline_kwargs={
        "max_new_tokens": 128
    }
)

Device set to use cpu


In [44]:

output = local_llm.invoke(query)
print(output)


Artificial Intelligence is the study of human intelligence.


In [45]:
prompt_template = """You are an expert assistant.
Use the following context to answer the user's question.

Guidelines:
- If the context contains the answer, provide it concisely in no more than five sentences.
- If the context does not contain the answer, reply: "I can't find the final answer but you may want to check the following links".

Do not repeat the guidelines. Only output the answer.

Context:
{context}

Question: {question}

Answer:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [46]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

In [47]:
print(retrievalQA.input_keys)

['query']


In [48]:
result=retrievalQA.invoke({"query": query})
print(result)

{'query': 'Explain what Artificial Intelligence in simple terms', 'result': 'It gives the machines the ability to Mimic human behavior. Artificial Intelligence (AI), a term coined by emeritus Stanford Professor John McCarthy in 1955, was defined by him as “the science and engineering of making intelligent machines”', 'source_documents': [Document(id='7d324d25-5973-48e0-a1f6-081e356dff3d', metadata={'producer': 'macOS Version 15.1 (Build 24B2082) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20250605093059Z00'00'", 'title': 'AI101', 'moddate': "D:20250605093059Z00'00'", 'source': 'pdfs/AI101.pdf', 'total_pages': 22, 'page': 1, 'page_label': '2'}, page_content='What is Artificial Intelligence?'), Document(id='44978063-e979-47ce-883e-79d0fc3af822', metadata={'producer': 'macOS Version 15.1 (Build 24B2082) Quartz PDFContext', 'creator': 'Keynote', 'creationdate': "D:20250605093245Z00'00'", 'title': 'MachineLearningBasics', 'moddate': "D:20250605093245Z00'00'", 'source': 'pdf

In [51]:
print(result.keys())

dict_keys(['query', 'result', 'source_documents'])


In [52]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: pdfs/AI101.pdf, Page: 1
Content: What is Artificial Intelligence?
----------------------------------------------------------------------------------------------------
There are 3 documents retrieved which are relevant to the query.
Relevant Document #2:
Source file: pdfs/MachineLearning-supervised.pdf, Page: 8
Content: What is Artificial Intelligence (AI)? 
AI Deﬁnition
It gives the machines the ability to Mimic human behavior.
Artiﬁcial Intelligence (AI), a term coined by
emeritus Stanford Professor John 
McCarthy in 1955,
was deﬁned by him as “the science and 
engineering of
making intelligent machines”
----------------------------------------------------------------------------------------------------
There are 3 documents retrieved which are relevant to the query.
Relevant Document #3