In [1]:
import os
import sys
from dotenv import load_dotenv, find_dotenv
# Make the project's `src` directory importable so the `rag` imports that follow
# can be resolved (presumably `rag` lives under `src` — adjust if the layout differs).
# The path is relative to the notebook's working directory.
src_path = os.path.abspath("../src")  # Adjust this relative path as needed
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)

from rag.document_loader import DocumentLoader
from rag.vector_store import VectorStore

# Locate a .env file and load the variables it defines into the process environment.
# The return value is bound to `_` so it is not echoed as this cell's output.
_ = load_dotenv(find_dotenv())

In [2]:
# Location of the PDF to index. The default is the original developer's local copy;
# set DOC_CHAT_PDF_PATH (in the environment or in the .env file loaded earlier) to
# point at the file on another machine instead of editing this cell.
DEFAULT_PDF_PATH = "/Users/patrick/projects/doc-chat/data/pdf/The Hundred-Page Machine Learning Book.pdf"
# `or` (rather than a .get default) also falls back when the variable is set but empty.
file_path = os.environ.get("DOC_CHAT_PDF_PATH") or DEFAULT_PDF_PATH
doc_loader = DocumentLoader(file_path)
# load_and_split() returns a pair: the loaded documents and the chunks cut from them.
documents, splits = doc_loader.load_and_split()
print(f"Number of documents: {len(documents)}")
print(f"Number of splits: {len(splits)}")

Number of documents: 152
Number of splits: 385


In [3]:
# Inspect the first loaded document to see its structure: a Document carrying
# a metadata dict and the page text in page_content.
documents[0]

Document(metadata={'producer': '3-Heights(TM) PDF Optimization Shell 4.8.25.2 (http://www.pdf-tools.com)', 'creator': 'PyPDF', 'creationdate': '2018-12-18T05:07:46+00:00', 'moddate': '2019-01-22T19:51:34+00:00', 'source': '/Users/patrick/projects/doc-chat/data/pdf/The Hundred-Page Machine Learning Book.pdf', 'total_pages': 152, 'page': 0, 'page_label': '1'}, page_content='The\nHundred-\nPage\nMachine\nLearning\nBook\nAndriy Burkov')

In [4]:
# Preview only the opening 300 characters of the third page (index 2)
# instead of dumping the whole page into the output.
third_page_text = documents[2].page_content
third_page_text[:300]

'Preface\nLet’s start by telling the truth: machines don’t learn. What a typical “learning machine”\ndoes, is ﬁnding a mathematical formula, which, when applied to a collection of inputs (called\n“training data”), produces the desired outputs. This mathematical formula also generates the\ncorrect outputs'

In [5]:
# Inspect the first chunk; it uses the same Document structure
# (metadata + page_content) as the loaded documents.
splits[0]

Document(metadata={'producer': '3-Heights(TM) PDF Optimization Shell 4.8.25.2 (http://www.pdf-tools.com)', 'creator': 'PyPDF', 'creationdate': '2018-12-18T05:07:46+00:00', 'moddate': '2019-01-22T19:51:34+00:00', 'source': '/Users/patrick/projects/doc-chat/data/pdf/The Hundred-Page Machine Learning Book.pdf', 'total_pages': 152, 'page': 0, 'page_label': '1'}, page_content='The\nHundred-\nPage\nMachine\nLearning\nBook\nAndriy Burkov')

In [6]:
# Text of the third chunk (index 2), shown in full to see where the splitter cut it.
splits[2].page_content

'Preface\nLet’s start by telling the truth: machines don’t learn. What a typical “learning machine”\ndoes, is ﬁnding a mathematical formula, which, when applied to a collection of inputs (called\n“training data”), produces the desired outputs. This mathematical formula also generates the\ncorrect outputs for most other inputs (distinct from the training data) on the condition that\nthose inputs come from the same or a similar statistical distribution as the one the training\ndata was drawn from.\nWhy isn’t that learning? Because if you slightly distort the inputs, the output is very likely\nto become completely wrong. It’s not how learning in animals works. If you learned to play\na video game by looking straight at the screen, you would still be a good player if someone\nrotates the screen slightly. A machine learning algorithm, if it was trained by “looking”\nstraight at the screen, unless it was also trained to recognize rotation, will fail to play the\ngame on a rotated screen.'

In [7]:
# Text of the following chunk; in the recorded output it opens with the sentence that
# ends splits[2], i.e. consecutive chunks overlap.
splits[3].page_content

'straight at the screen, unless it was also trained to recognize rotation, will fail to play the\ngame on a rotated screen.\nSo why the name “machine learning” then? The reason, as is often the case, is marketing:\nArthur Samuel, an American pioneer in the ﬁeld of computer gaming and artiﬁcial intelligence,\ncoined the term in 1959 while at IBM. Similarly to how in the 2010s IBM tried to market\nthe term “cognitive computing” to stand out from competition, in the 1960s, IBM used the\nnew cool term “machine learning” to attract both clients and talented employees.\nAs you can see, just like artiﬁcial intelligence is not intelligence, machine learning is not\nlearning. However, machine learning is a universally recognized term that usually refers\nto the science and engineering of building machines capable of doing various useful things\nwithout being explicitly programmed to do so. So, the word “learning” in the term is used\nby analogy with the learning in animals rather than literally

In [10]:
# Build the vector store from the chunks produced by load_and_split().
# NOTE(review): presumably this embeds and indexes every chunk, which may be slow
# or call an external service — confirm in rag.vector_store.
vector_store = VectorStore(splits)

In [24]:
# Ask the QA chain a question with k=2 (the recorded run returned two source
# documents), then print only the generated answer text.
question = "who is this book for?"
answer = vector_store.qa_chain(question, k=2)
print(answer["result"])

 This book is for beginners in machine learning who want to gain a basic understanding of the field and for experienced practitioners looking for further self-improvement and direction in solving technical or business problems using machine learning techniques.


In [31]:
# Keep the documents the chain returned as evidence for the answer,
# and check how many came back.
source_docs = answer["source_documents"]
len(source_docs)

2

In [33]:
# Show the first returned source document (metadata and text) behind the answer.
source_docs[0]

Document(metadata={'creationdate': '2018-12-18T05:07:46+00:00', 'creator': 'PyPDF', 'moddate': '2019-01-22T19:51:34+00:00', 'page': 2, 'page_label': '3', 'producer': '3-Heights(TM) PDF Optimization Shell 4.8.25.2 (http://www.pdf-tools.com)', 'source': '/Users/patrick/projects/doc-chat/data/pdf/The Hundred-Page Machine Learning Book.pdf', 'total_pages': 152}, page_content='by analogy with the learning in animals rather than literally.\nWho This Book is For\nThis book contains only those parts of the vast body of material on machine learning developed\nsince the 1960s that have proven to have a signiﬁcant practical value. A beginner in machine\nlearning will ﬁnd in this book just enough details to get a comfortable level of understanding\nof the ﬁeld and start asking the right questions.\nPractitioners with experience can use this book as a collection of directions for further\nself-improvement. The book also comes in handy when brainstorming at the beginning of a\nproject, when you try 

In [41]:
# NOTE(review): this repeats the earlier `source_docs[0]` display and shows the same
# document — possibly meant to show source_docs[1]; confirm or remove the duplicate.
source_docs[0]

Document(metadata={'creationdate': '2018-12-18T05:07:46+00:00', 'creator': 'PyPDF', 'moddate': '2019-01-22T19:51:34+00:00', 'page': 2, 'page_label': '3', 'producer': '3-Heights(TM) PDF Optimization Shell 4.8.25.2 (http://www.pdf-tools.com)', 'source': '/Users/patrick/projects/doc-chat/data/pdf/The Hundred-Page Machine Learning Book.pdf', 'total_pages': 152}, page_content='by analogy with the learning in animals rather than literally.\nWho This Book is For\nThis book contains only those parts of the vast body of material on machine learning developed\nsince the 1960s that have proven to have a signiﬁcant practical value. A beginner in machine\nlearning will ﬁnd in this book just enough details to get a comfortable level of understanding\nof the ﬁeld and start asking the right questions.\nPractitioners with experience can use this book as a collection of directions for further\nself-improvement. The book also comes in handy when brainstorming at the beginning of a\nproject, when you try 