In [1]:
# Load the `dotenv` IPython extension, then use its magic to read the
# `.env` file located one directory above this notebook.
# NOTE(review): presumably this provides the API credentials used by the
# chat model created further down — confirm.
%load_ext dotenv
%dotenv ../.env

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os

In [18]:
# Loader *class* (not an instance): `return_splits` calls it once per file path.
file_loader = PyPDFLoader

# Splitter configured with chunk_size=1000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Directory holding the PDF reports.
PATH_TO_PDFS = './reports/24Q1USBANK'
def return_splits(path, file_loader, text_splitter):
    """Load every regular file directly inside ``path`` and split the result.

    Args:
        path: Directory to scan. Only its immediate entries are considered;
            sub-directories are skipped, not descended into.
        file_loader: Callable that takes a file path and returns an object
            exposing ``load()``, which returns a list of documents.
        text_splitter: Object exposing ``split_documents(docs)``.

    Returns:
        Whatever ``text_splitter.split_documents`` returns for the combined
        list of loaded documents (in ``os.listdir`` order).
    """
    docs = []
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        # Only regular files are loaded. Loading must happen inside this
        # guard: otherwise a sub-directory entry would either hit an unbound
        # loader (if listed first) or re-load the previous file's documents.
        if not os.path.isfile(entry_path):
            continue
        docs.extend(file_loader(entry_path).load())
    return text_splitter.split_documents(docs)


In [5]:
from langchain_openai import  ChatOpenAI


In [52]:
# Chat model client constructed with no arguments, i.e. the library defaults.
# NOTE(review): presumably reads its credentials from the environment — confirm.
llm = ChatOpenAI()

In [7]:
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate


In [44]:
# System instruction sent with every request.
# NOTE(review): "containted" is a typo for "contained"; the text is kept
# verbatim here so the prompt sent to the model is unchanged.
system_instructions = """Read the content and determine if it contains information relevant to the question. If no relevant information is containted, respond simply with "NONE". If relevant information is contained, tell me briefly what the content relevant to the question is."""

# Human message template; `{content}` and `{question}` are template variables.
human_template = "Content: \n {content} \n\n Question: {question}"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_instructions),
        ("human", human_template),
    ]
)

In [45]:
# Smoke test: render the template with placeholder values so the resulting
# system/human messages can be inspected in the cell output.
prompt.invoke({"content": "this is content", "question": "this is question"})

ChatPromptValue(messages=[SystemMessage(content='Read the content and determine if it contains information relevant to the question. If no relevant information is containted, respond simply with "NONE". If relevant information is contained, tell me briefly what the content relevant to the question is.'), HumanMessage(content='Content: \n this is content \n\n Question: this is question')])

In [14]:
from langchain_core.runnables import *

In [46]:
# Pull the two fields the prompt template needs out of the input mapping.
prompt_inputs = RunnableParallel({
    "question": lambda payload: payload['question'],
    "content": lambda payload: payload['content'],
})

# Feed the selected fields into the prompt template.
prompt_chain = prompt_inputs | prompt

In [53]:
# Step 1: render one prompt per entry in `docs`, all sharing the same question.
# NOTE(review): each entry is passed whole as `content`; if these are document
# objects, presumably the template stringifies them (metadata included) —
# confirm whether only the text was intended.
render_prompts = RunnableLambda(
    lambda inp: prompt_chain.batch(
        [{"question": inp['question'], "content": doc} for doc in inp['docs']]
    )
)

# Step 2: send all rendered prompts to the model in a single batch call.
ask_llm = RunnableLambda(lambda prompts: llm.batch(inputs=prompts))

chain = RunnableSequence(render_prompts, ask_llm)

In [41]:
question = "how have financial statement changed over since Q1"

# Reuse the configured report directory instead of repeating the literal path.
splits = return_splits(path=PATH_TO_PDFS, file_loader=file_loader, text_splitter=text_splitter)

# Build the expected `source` value with os.path.join, the same way
# `return_splits` builds the path it hands to the loader. The previous
# hard-coded value used a Windows backslash and therefore matched nothing on
# other platforms; on Windows this expression yields that same string.
# NOTE(review): assumes the loader records the path it was given in
# `metadata['source']`, as the original literal implies — confirm.
earnings_call_pres_source = os.path.join(PATH_TO_PDFS, '2Q24-Earnings-Call-Presentation.pdf')
earnings_call_pres_splits = [s for s in splits if s.metadata['source'] == earnings_call_pres_source]

In [54]:
# Run the two-step chain over the filtered splits; `res` holds one model
# response per split, in the same order as `earnings_call_pres_splits`.
res = chain.invoke({"question" : question, "docs" : earnings_call_pres_splits})

In [56]:
# Peek at the model's answer for the first split.
res[0].content

'NONE'

In [57]:
# Persist every model reply to res.txt, numbered from 1 in split order.
# The encoding is pinned to UTF-8 because replies may contain non-ASCII
# characters that a platform default codec (e.g. cp1252 on Windows) cannot
# encode. Plain write mode is enough: the file is never read back here.
with open('res.txt', 'w', encoding='utf-8') as out_file:
    for i, r in enumerate(res, 1):
        out_file.write(f"split {i}\n")
        out_file.write(f"{r.content}\n\n")