In [1]:
#%pip install -qU langchain-text-splitters
#%pip install -qU "langchain-chroma>=0.1.2"
#%pip install langchain-ollama

from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_ollama.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from IPython.display import display, Markdown


In [2]:
!pip3 freeze | grep tensorflow
!pip3 freeze | grep keras


tensorflow==2.16.2
tensorflow-io-gcs-filesystem==0.37.1
tensorflow-macos==2.16.2
tensorflow-metal==1.1.0
keras==3.6.0


In [2]:
import pymupdf # imports the pymupdf library
doc = pymupdf.open('/Users/lszlpotyondi/Library/Mobile Documents/com~apple~CloudDocs/Downloads/Books/Fundamentals_of_Corporate_Finance_12E_20.pdf') # open a document
text = ''
for page in doc: # iterate the document pages
  text+= page.get_text() # get plain text encoded as UTF-8
print(text)

Fundamentals of 
CORPORATE FINANCE
Financial Management
Block, Hirt, and Danielsen
Foundations of Financial Management
Sixteenth Edition
Brealey, Myers, and Allen
Principles of Corporate Finance
Twelfth Edition
Brealey, Myers, and Allen
Principles of Corporate Finance, Concise 
Second Edition
Brealey, Myers, and Marcus
Fundamentals of Corporate Finance
Ninth Edition
Brooks
FinGame Online 5.0
Bruner, Eades, and Schill
Case Studies in Finance: Managing for 
Corporate Value Creation
Eighth Edition
Cornett, Adair, and Nofsinger
Finance: Applications and Theory
Fourth Edition
Cornett, Adair, and Nofsinger
M: Finance
Fourth Edition
DeMello
Cases in Finance
Third Edition
Grinblatt (editor)
Stephen A. Ross, Mentor: Influence through 
Generations
Grinblatt and Titman
Financial Markets and Corporate Strategy
Second Edition
Higgins
Analysis for Financial Management
Twelfth Edition
Ross, Westerfield, Jaffe, and Jordan
Corporate Finance
Eleventh Edition
Ross, Westerfield, Jaffe, and Jordan
Corporat

In [18]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=5000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
texts = text_splitter.create_documents([text])
print(texts[0])
print(texts[1])

page_content='Fundamentals of 
CORPORATE FINANCE
Financial Management
Block, Hirt, and Danielsen
Foundations of Financial Management
Sixteenth Edition
Brealey, Myers, and Allen
Principles of Corporate Finance
Twelfth Edition
Brealey, Myers, and Allen
Principles of Corporate Finance, Concise 
Second Edition
Brealey, Myers, and Marcus
Fundamentals of Corporate Finance
Ninth Edition
Brooks
FinGame Online 5.0
Bruner, Eades, and Schill
Case Studies in Finance: Managing for 
Corporate Value Creation
Eighth Edition
Cornett, Adair, and Nofsinger
Finance: Applications and Theory
Fourth Edition
Cornett, Adair, and Nofsinger
M: Finance
Fourth Edition
DeMello
Cases in Finance
Third Edition
Grinblatt (editor)
Stephen A. Ross, Mentor: Influence through 
Generations
Grinblatt and Titman
Financial Markets and Corporate Strategy
Second Edition
Higgins
Analysis for Financial Management
Twelfth Edition
Ross, Westerfield, Jaffe, and Jordan
Corporate Finance
Eleventh Edition
Ross, Westerfield, Jaffe, and J

In [39]:
vector_db = Chroma.from_texts(
    texts=text,
    embedding=OllamaEmbeddings(model="mxbai-embed-large"),
    collection_name="local-rag"
)
print("Vector database created successfully")

KeyboardInterrupt: 

In [None]:
local_model = "llama3.2"  # or whichever model you prefer
llm = ChatOllama(model=local_model)

In [None]:
# Query prompt template
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""Your task is to generate 3 
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)




# Set up retriever
retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(), 
    llm,
    prompt=QUERY_PROMPT
)



In [None]:
# RAG prompt template
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Create chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
def chat_with_pdf(question):
    """
    Chat with the PDF using the RAG chain.
    """
    return display(Markdown(chain.invoke(question)))

In [None]:
def chat_with_pdf_2(question):
    """
    Chat with the PDF using the RAG chain.
    """

    results= vector_db.similarity_search_with_score(question, k=5)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(template)
    prompt = prompt_template.format(context=context_text, question=question)
    #print(prompt)

    response_text = llm.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text

In [None]:
response= chat_with_pdf_2("What are the main ideas of this book?")
print(response.content)

Response: content='The book appears to be a comprehensive finance textbook that covers various topics in finance, including:\n\n1. **Financial Planning**: The importance of financial planning for small businesses, including creating pro forma statements using the percentage of sales approach.\n2. **Valuation of Future Cash Flows**: Two chapters on time value of money (Chapter 5 and Chapter 6), one on bond valuation (Chapter 7), and another on stock valuation (Chapter 8).\n3. **Capital Budgeting**: Three chapters on capital budgeting, including net present value and other investment criteria (Chapter 9), making capital investment decisions (Chapter 10), and project analysis and evaluation (Chapter 11).\n4. **Risk and Return**: A chapter on lessons from capital market history (Chapter 12), which covers topics such as geometric vs. arithmetic returns, capital market history, market efficiency, the equity risk premium, and efficient markets hypothesis.\n\nThe book appears to be designed fo

In [None]:
chat_with_pdf("Who are the authors of this book?")

Unfortunately, I don't see any information about the authors in the provided text snippet. However, based on the context and content of the text, it appears to be a textbook or educational resource related to finance and accounting, possibly from a business or economics program. If you're looking for more specific information about the authors, I recommend checking the book's title page, publisher, or any other relevant metadata that may be included in the text snippet.

In [None]:
chat_with_pdf_2("How many chapters are there and what are their names? Format: There are n chapters. 1. chapter name")

Response: content='There is only one question provided. Here is the answer:\n\n1. Chapter 7 name: Interest Rates and Bond Valuation' additional_kwargs={} response_metadata={'model': 'llama3.2', 'created_at': '2025-02-24T13:02:33.524415Z', 'done': True, 'done_reason': 'stop', 'total_duration': 9600014959, 'load_duration': 22430042, 'prompt_eval_count': 2048, 'prompt_eval_duration': 8193000000, 'eval_count': 26, 'eval_duration': 1383000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)} id='run-8625e7bb-505e-441a-b5dc-2b8029698975-0' usage_metadata={'input_tokens': 2048, 'output_tokens': 26, 'total_tokens': 2074}
Sources: [None, None, None, None, None]


AIMessage(content='There is only one question provided. Here is the answer:\n\n1. Chapter 7 name: Interest Rates and Bond Valuation', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2025-02-24T13:02:33.524415Z', 'done': True, 'done_reason': 'stop', 'total_duration': 9600014959, 'load_duration': 22430042, 'prompt_eval_count': 2048, 'prompt_eval_duration': 8193000000, 'eval_count': 26, 'eval_duration': 1383000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-8625e7bb-505e-441a-b5dc-2b8029698975-0', usage_metadata={'input_tokens': 2048, 'output_tokens': 26, 'total_tokens': 2074})

In [34]:
vector_db.delete_collection()