In [30]:
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


In [28]:

def create_vector_store(pdf_path: str, persist_directory: str = "pdf_store"):
    
    
    
    
    
    
    
    
    
    
    # 5. Persist the vector store
    vector_store.persist()
    print(f"Vector store created and saved to {persist_directory}")
    
    return vector_store

In [31]:
pdf_path = r"..\..\data\pdf_books\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf"  
persist_dir = "pdf_store"  # Directory where the vector store will be saved


### 1. Load PDF

In [34]:
print(f"Loading PDF from {pdf_path}")
loader = PyPDFLoader(pdf_path)
pages = loader.load()


Loading PDF from ..\..\data\pdf_books\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf


In [35]:
print(f"{len(pages)} Pages in the PDF")
pages


415 Pages in the PDF


[Document(metadata={'source': '..\\..\\data\\pdf_books\\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': '..\\..\\data\\pdf_books\\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf', 'page': 1}, page_content=''),
 Document(metadata={'source': '..\\..\\data\\pdf_books\\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf', 'page': 2}, page_content='Pattern Recognition \nand Neural Networks \nB. D. RIPLEY \nUniversity of Oxford '),
 Document(metadata={'source': '..\\..\\data\\pdf_books\\BrianDRipley-PatternRecognitionandNeuralNetworks(1996).pdf', 'page': 3}, page_content='PUBLISHED BY THE PRESS SYNDICATE OF THE UNIVERSITY OF CAMBRIDGE \nThe Pitt Building, Trumpington Street, Cambridge, United Kingdom \nCAMBRIDGE UNIVERSITY PRESS \nThe Edinburgh Building, Cambridge CB2 2RU, UK \n40 West 20th Street, New York NY 10011-4211, USA \n477 Williamstown Road, Port Melbourne, VIC 3207, Australia \nRuiz de Alarco

In [55]:
# 2. Split text into chunks
print("Splitting text into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len
)

pages2 = ["qwer","tyui","op[]","adf","ghjk","l;'"]
texts = []
for page in pages:
    cont = page.page_content.replace("\xad\n","").replace("\n","")
    chunks = text_splitter.split_text(cont)
    print(chunks)
    texts.extend(chunks)

print(f"Created {len(texts)} chunks of text")



Splitting text into chunks...
[]
[]
['Pattern Recognition and Neural Networks B. D. RIPLEY University of Oxford']
['PUBLISHED BY THE PRESS SYNDICATE OF THE UNIVERSITY OF CAMBRIDGE The Pitt Building, Trumpington Street, Cambridge, United Kingdom CAMBRIDGE UNIVERSITY PRESS The Edinburgh Building, Cambridge CB2 2RU, UK 40 West 20th Street, New York NY 10011-4211, USA 477 Williamstown Road, Port Melbourne, VIC 3207, Australia Ruiz de Alarcon 13, 28014 Madrid, Spain Dock House, The Waterfront, Cape Town 8001, South Africa http://www.cambridge.org © B. D. Ripley 1996 This book is in copyright. Subject to statutory exception and to the provisions of relevant collective licensing agreements, no reproduction of any part may take place without the written permission of Cambridge University Press. First published 1996 Eighth printing 2005 Printed in the United Kingdom at the University Press, Cambridge A catalogue record for this book is available from the British Library Library of Congress Cata

In [56]:
# 3. Create embeddings
print("Creating embeddings...")
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'}
)


Creating embeddings...


In [57]:
# 4. Create and persist the vector store
print("Creating and persisting vector store...")
vector_store = Chroma.from_texts(
    texts=texts,
    embedding=embedding_function,
    persist_directory=persist_dir,
    collection_name="pdf_collection"
)

Creating and persisting vector store...


In [58]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Load the saved vector store
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'}
)

vector_store = Chroma(
    persist_directory="pdf_store",
    embedding_function=embeddings,
    collection_name="pdf_collection"
)

# Example query
query = "what is bias?"
docs = vector_store.similarity_search(query, k=100)  # Get top 3 most relevant chunks
for i,doc in enumerate(docs):
    print("\n\n\n",i)
    print(doc.page_content)




 0
bias. (It is defined and discussed below.) The widespread use of the



 1
estimate that bias, and correct it using our estimate. For concreteness



 2
the bias. Efron (1982, Chapter 7) gives a suggestive argument why the



 3
less biased, if they are not unbiased in the first place. For example,



 4
. L:j Pr{A I Bj}Pr{Bj} Bayes rule is a rule which attains the Bayes risk, and so is the 'gold-standard', the best possible for that problem. bias has two meanings. (a) The bias of an estimator is the



 5
References 367 
Fukunaga , K. & Hummels , D. M. (1987a) Bias of



 6
two meanings. (a) The bias of an estimator is the difference between its mean and the true value. (b) For a neural network, parameters which are constants (rather than multiplying signals) are often



 7
bias has two meanings. (a) The bias of an estimator is the difference between



 8
biases. 
When the biased training set is created by subsampling a larger



 9
bias. (It is defined and discussed below.) T

In [59]:
doc

Document(metadata={}, page_content='nition. The training set is regarded as a sample from a population of')