In [None]:
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


Loading PDF from ..\..\data\pdf_books\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf
Splitting text into chunks...
Created 1329 chunks of text
Creating embeddings...


  embedding_function = HuggingFaceEmbeddings(







Creating and persisting vector store...
Vector store created and saved to pdf_store

Vector store creation completed!
You can now load this vector store from pdf_store for querying


  vector_store.persist()


In [None]:
def load_pdf(pdf_path: str) -> List:
    """Read a PDF through ``PyPDFLoader`` and return the loaded result.

    Args:
        pdf_path: Location of the PDF file; forwarded unchanged to
            ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns (presumably one
        document object per page -- confirm against the loader's docs).
    """
    return PyPDFLoader(pdf_path).load()

def create_vector_store(pdf_path: str, persist_directory: str = "pdf_store"):
    """Build a persisted Chroma vector store from the text of one PDF.

    The PDF is loaded with ``load_pdf``, each page is split into overlapping
    chunks (1000 characters, 200 overlap), the chunks are embedded with
    ``sentence-transformers/all-mpnet-base-v2`` on the CPU, and the result is
    stored in the ``pdf_collection`` collection under ``persist_directory``.

    Each chunk is stored together with the primitive-valued metadata of the
    page it came from, and with a deterministic id built from the PDF file
    name, the page position and the chunk position. With stable ids, running
    this again for the same PDF and directory is expected to update the
    existing entries instead of appending a second copy of every chunk.

    Args:
        pdf_path: Path of the PDF to index.
        persist_directory: Directory in which the vector store is saved.

    Returns:
        The vector store object returned by ``Chroma.from_texts``.

    Raises:
        ValueError: If no text chunk could be produced from the PDF.
    """
    import warnings

    # 1. Load PDF
    print(f"Loading PDF from {pdf_path}")
    pages = load_pdf(pdf_path)

    # 2. Split text into chunks
    print("Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )

    source_name = os.path.basename(pdf_path)
    texts = []
    metadatas = []
    ids = []
    for page_index, page in enumerate(pages):
        # Keep only plain scalar values; anything else (None, lists, ...) is
        # dropped so the metadata stays safe to hand to the store.
        page_metadata = {
            key: value
            for key, value in (getattr(page, "metadata", None) or {}).items()
            if isinstance(value, (str, int, float, bool))
        }
        chunks = text_splitter.split_text(page.page_content)
        for chunk_index, chunk in enumerate(chunks):
            texts.append(chunk)
            metadatas.append(dict(page_metadata))
            # Position-based ids are unique within one run and stable across runs.
            ids.append(f"{source_name}-p{page_index}-c{chunk_index}")

    print(f"Created {len(texts)} chunks of text")
    if not texts:
        # Fail here with a clear message rather than deep inside the store.
        raise ValueError(f"No extractable text found in {pdf_path}")

    # 3. Create embeddings
    print("Creating embeddings...")
    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cpu'}
    )

    # 4. Create and persist the vector store
    print("Creating and persisting vector store...")
    vector_store = Chroma.from_texts(
        texts=texts,
        embedding=embedding_function,
        metadatas=metadatas,
        ids=ids,
        persist_directory=persist_directory,
        collection_name="pdf_collection"
    )

    # 5. Persist the vector store
    # NOTE(review): explicit persist() is deprecated in recent LangChain/Chroma
    # releases (they persist automatically) and may be absent in future ones;
    # call it only when available and silence just its deprecation warning.
    persist = getattr(vector_store, "persist", None)
    if callable(persist):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            persist()
    print(f"Vector store created and saved to {persist_directory}")

    return vector_store

In [None]:
# Build the path from its components instead of hard-coding "\" separators:
# on Windows this yields the same string as before, and on other platforms it
# uses the native separator so the notebook can find the file there too.
pdf_path = os.path.join(
    "..",
    "..",
    "data",
    "pdf_books",
    "BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf",
)
persist_dir = "pdf_store"  # Directory where the vector store will be saved

# Index the PDF and keep the resulting store available to later cells.
vector_store = create_vector_store(pdf_path, persist_dir)

print("\nVector store creation completed")

In [8]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Re-open the store saved in "pdf_store". The embedding model and collection
# name are the same literals used when the store was created.
embeddings = HuggingFaceEmbeddings(
    model_kwargs={'device': 'cpu'},
    model_name="sentence-transformers/all-mpnet-base-v2",
)
vector_store = Chroma(
    collection_name="pdf_collection",
    embedding_function=embeddings,
    persist_directory="pdf_store",
)

# Example query: fetch the 3 most similar chunks.
query = "what is supervised learning?"
docs = vector_store.similarity_search(query, k=3)

# Print each hit, preceded by blank lines and its rank.
for i, doc in enumerate(docs):
    print("\n\n\n", i)
    print(doc.page_content)




 0
test set A set of examples used only to assess the performance of a fully­
specified classifier. 
training set A set of examples used for learning, that is to fit the parameters 
of the classifier. 
uniform convergence A sequence of functions fn converges uniformly to f if 
maxx lfn(x)-f(x)l ---+ 0 as n ---+ oo. We have uniform convergence on 
compacta if this holds whenever the maximum is taken over any compact 
set K. 
unsupervised learning Discovering groupings in the training set when none are 
pre-specified. 
updating Changing the classifier when new examples become available, possi­
bly lacking their true classifications. 
validation set A set of examples used to tune the parameters of a classifier, for 
example to choose the number of hidden units in a neural network. 
vector quantization A method of encoding data for signal transmission, in 
which a vector is replaced by one of a finite number of representatives. 
See page 201.



 1
latter have been experienced) . But we