In [9]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings, StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from dotenv import load_dotenv
import os
import chromadb

In [10]:
# Load variables from a local .env file into the process environment.
# The bare call stays as the cell's last expression so its return value is displayed.
# NOTE(review): presumably this supplies the OpenAI API key needed by the query step — confirm.
load_dotenv()

True

In [11]:
# Restrict ingestion to PDF and LaTeX files found in the sibling data directory.
required_exts = [".pdf", ".tex"]
reader = SimpleDirectoryReader(
    input_dir="../data",
    required_exts=required_exts,
)

In [12]:
# Read every file selected by the reader into a list of Document objects.
# NOTE(review): the resulting count is far larger than a plausible number of files,
# so PDFs are presumably split into one Document per page — confirm.
docs = reader.load_data()



In [13]:
# Sanity check: report how many Document objects the reader produced.
print("Loaded {} documents".format(len(docs)))

Loaded 1462 documents


In [14]:
# Inspect the first loaded document: its size and the first 500 characters of text.
first_text = docs[0].text
print(f"Chunk length: {len(first_text)} characters")
print(f"Preview: {first_text[:500]}")

Chunk length: 147 characters
Preview: Gareth James • Daniela Witten •
Trevor Hastie • Robert Tibshirani
An Introduction to Statistical
Learning
with Applications in R
Second Edition
123


We replace LlamaIndex's default OpenAI embedding model with a Sentence Transformers model from Hugging Face (`all-MiniLM-L6-v2`), since it is open-source and free to use.

In [15]:
# Use a local Sentence Transformers model for embeddings instead of the default.
# The Hub repo id is spelled with a lowercase "v2"; the previous "V2" spelling only
# worked because the Hub answered every request with a 307 redirect to this name.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

2026-02-12 13:37:31,975 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-V2
2026-02-12 13:37:32,185 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-V2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-12 13:37:32,286 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
2026-02-12 13:37:32,302 - INFO - HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
2026-02-12 13:37:32,402 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-V2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
2026-02-12 13:37:32,502 - INFO - HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sen

In [17]:
# Open (or create) the on-disk Chroma database next to this notebook.
chroma_client = chromadb.PersistentClient(path="./chroma")

# create_collection() raises if "textbooks" already exists, which is the case on
# every run after the first because the store is persistent. Using
# get_or_create_collection() keeps this cell safe under Restart & Run All.
# NOTE: vectors written by earlier runs are kept; check chroma_collection.count()
# before indexing documents again to avoid inserting duplicates.
chroma_collection = chroma_client.get_or_create_collection(name="textbooks")

In [18]:
# Wrap the Chroma collection so LlamaIndex can use it as its vector store,
# then build a storage context that routes vectors to that store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [19]:
# Embedding every document is the slow step, and running it again against a
# collection that already holds vectors would insert duplicates. Only embed when
# the collection is empty; otherwise reattach to the vectors already stored.
# NOTE: a non-empty collection is assumed to be complete — if a previous indexing
# run was interrupted, clear the collection before re-running.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
else:
    index = VectorStoreIndex.from_vector_store(vector_store)

In [27]:
# Build a query engine that retrieves the 3 most similar nodes and combines them
# with the "tree_summarize" response mode; verbose=True prints progress details.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    similarity_top_k=3,
    verbose=True,
)

question = "What is gradient descent? Give me citations from where you get the information from"
response = query_engine.query(question)

1 text chunks after repacking


2026-02-12 13:53:11,173 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [29]:
# Print only the synthesized answer text; the retrieved passages that back it
# are available separately on response.source_nodes.
print(response.response)

Gradient descent is a method used to minimize an objective function by iteratively moving in the direction of the steepest decrease of the function. It involves calculating the gradient of the objective function at a current point and updating the parameters in the opposite direction of the gradient to reach a local minimum. The information provided is from pages 446, 447, and 448 of the document located at the file path /Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/RAG/notebooks/../data/ISLRv2_corrected_June_2023.pdf.


In [30]:
# List the retrieved passages behind the answer: rank, similarity score,
# originating file name, and the first 200 characters of the passage text.
print("\nSources:")
for rank, hit in enumerate(response.source_nodes, start=1):
    file_name = hit.metadata.get('file_name', 'Unknown')
    print(
        f"\n{rank}. Score: {hit.score:.3f}",
        f"   File: {file_name}",
        f"   Text: {hit.text[:200]}...",
        sep="\n",
    )


Sources:

1. Score: 0.351
   File: ISLRv2_corrected_June_2023.pdf
   Text: 10.7 Fitting a Neural Network 435
−1.0 −0.5 0.0 0.5 1.0
0123456
θ
R(θ)
θ0θ1 θ2 θ7
●●
●
●
R(θ0)R(θ1)
R(θ2)
R(θ7)
FIGURE 10.17.Illustration of gradient descent for one-dimensional θ. The
objective funct...

2. Score: 0.348
   File: ISLRv2_corrected_June_2023.pdf
   Text: 10.7 Fitting a Neural Network 437
051015202530
0.10.20.30.4
Epochs
Value of Objective Function
Training SetValidation Set
051015202530
0.000.020.040.060.080.100.12
Epochs
Classification Error
FIGURE 1...

3. Score: 0.347
   File: ISLRv2_corrected_June_2023.pdf
   Text: 436 10. Deep Learning
moveθa little in theoppositedirection (since we wish to go downhill):
θm+1←θm−ρ∇R(θm). (10.27)
For a small enough value of thelearning rateρ, this step will decrease the learning...
