In [66]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext, Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
#from llama_index.postprocessor.cohere_rerank import CohereRerank
from dotenv import load_dotenv
import os
import pypdf
import chromadb

In [10]:
load_dotenv()

True

In [56]:
def load_documents_with_metadata_included(data_path:str):
    all_docs = []
    for filename in os.listdir(data_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(data_path, filename)
        reader = pypdf.PdfReader(file_path)

        if "lecture" in filename.lower():
            doc_type = "lecture"

        if 'lecture' not in filename.lower():
            doc_type = "textbook"

        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            doc = Document(text = text, 
                           metadata = {
                               "file_name" : filename, 
                               "page_num" : page_num, 
                               "doc_type" : doc_type,
                               "course" : "Machine Learning"
                           })     
            all_docs.append(doc)
            

    return all_docs    

In [57]:
documents = load_documents_with_metadata_included("../data")



In [58]:
documents

[Document(id_='d6890b80-96a3-42b6-9aec-f64227157260', embedding=None, metadata={'file_name': 'ST443_Lecture_5.pdf', 'page_num': 0, 'doc_type': 'lecture', 'course': 'Machine Learning'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='ST443: Machine Learning and Data Mining\nMilan Vojnović\nDepartment of Statistics\nLondon School of Economics and Political Science\nLecture 5: Regularisation\n29 Oct 2024', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='e4b47818-c484-4621-91af-2223a4dacd4c', embedding=None, metadata={'file_name': 'ST443_Lecture_5.pdf', 'page_num': 1, 'doc_type': 'lecture', 'course': 'Machine Learning'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key

In [50]:
#required_exts = ['.pdf', '.tex']
#reader = SimpleDirectoryReader(input_dir = "../data", required_exts = required_exts)

In [49]:
#docs = reader.load_data()

In [13]:
print(f"Loaded {len(docs)} documents")

Loaded 1789 documents


In [14]:
# See what one chunk looks like
print(f"Chunk length: {len(docs[0].text)} characters")
print(f"Preview: {docs[0].text[:500]}")

Chunk length: 147 characters
Preview: Gareth James • Daniela Witten •
Trevor Hastie • Robert Tibshirani
An Introduction to Statistical
Learning
with Applications in R
Second Edition
123


We will change the default embedding model from the OpenAI one to the sentence transformer model from HuggingFace as it is open-source and free.

In [59]:
Settings.embed_model = HuggingFaceEmbedding(model_name = 'sentence-transformers/all-MiniLM-L6-V2')

2026-02-13 00:18:30,989 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-V2
2026-02-13 00:18:31,145 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-V2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-13 00:18:31,245 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-13 00:18:31,260 - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
2026-02-13 00:18:31,363 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-V2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
2026-02-13 00:18:31,493 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sen

In [20]:
chroma_client = chromadb.PersistentClient(path = './chroma')
chroma_collection = chroma_client.create_collection('ml_textbooks') 

In [60]:
vector_store = ChromaVectorStore(chroma_collection= chroma_collection,)
storage_context = StorageContext.from_defaults(vector_store = vector_store)

In [61]:
index = VectorStoreIndex.from_documents(documents, storage_context = storage_context)

In [62]:
query_engine = index.as_query_engine(response_mode = 'tree_summarize', verbose = True,  similarity_top_k = 5)

response = query_engine.query("What is gradient descent? Return answers only based off of the books given to you.")

1 text chunks after repacking


2026-02-13 00:19:07,880 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [63]:
print(f"Answer : {response.response}")
print("\nSources: ")
for i, node in enumerate(response.source_nodes, 1):
    print(f"\n{i}. Score: {node.score:.3f}")
    print(f"   File: {node.metadata.get('file_name', 'Unknown')}")
    print(f"   Text: {node.text[:200]}...")

Answer : Gradient descent is a first-order optimization algorithm used to find a local minimum of a function by taking steps proportional to the negative of the gradient of the function at the current point. The algorithm involves iteratively updating the parameters based on the negative gradient direction to converge towards a local minimum.

Sources: 

1. Score: 0.534
   File: mml-book.pdf
   Text: 228 Continuous Optimization
where f : Rd → R is an objective function that captures the machine
learning problem at hand. We assume that our functionf is differentiable,
and we are unable to analytica...

2. Score: 0.485
   File: mml-book.pdf
   Text: 228 Continuous Optimization
where f : Rd → R is an objective function that captures the machine
learning problem at hand. We assume that our functionf is differentiable,
and we are unable to analytica...

3. Score: 0.450
   File: mml-book.pdf
   Text: 230 Continuous Optimization
Although the “undo” step seems to be a waste of resources, using