In [1]:
%load_ext dotenv
%dotenv ../.env

# Loading docs

In [2]:
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Loader class (not an instance); it is called with one file path per document.
file_loader = PyPDFLoader

# Chat model that generates the answer, and the embedding model used for the vector index.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
embedding_model = OpenAIEmbeddings()

# Splits the loaded documents into overlapping chunks before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

## location of docs

In [3]:
PATH_TO_DIR = './reports/24Q1USBANK'


In [4]:
# Build the FAISS index on the first run and reuse the copy saved on disk afterwards.
INDEX_PATH = os.path.join(PATH_TO_DIR, "faiss_index")
if os.path.exists(INDEX_PATH):
    # allow_dangerous_deserialization is an explicit opt-in to loading serialized data;
    # only load an index that you created yourself.
    vectorstore = FAISS.load_local(INDEX_PATH, embedding_model, allow_dangerous_deserialization=True)
else:
    docs = []
    # Sorted so the documents are loaded in the same order on every machine.
    for f in sorted(os.listdir(PATH_TO_DIR)):
        file_path = os.path.join(PATH_TO_DIR, f)
        # The directory also receives non-PDF files (answer.txt and source.txt are
        # written into it further down), so only *.pdf files go to the PDF loader.
        if os.path.isfile(file_path) and f.lower().endswith(".pdf"):
            loader = file_loader(file_path)
            docs.extend(loader.load())

    # Fail with a clear message rather than handing an empty list to FAISS.from_documents.
    if not docs:
        raise FileNotFoundError(f"No PDF documents found in {PATH_TO_DIR!r}")

    splits = text_splitter.split_documents(docs)

    vectorstore = FAISS.from_documents(splits, embedding_model)
    vectorstore.save_local(INDEX_PATH)

In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate


# description of Agent task

In [6]:
DESCRIPTION_OF_PURPOSE = """You are a AI agent for extracting the economic predictions from text. Use the given context to summarize information in it"""

In [7]:
# Two-message prompt: a fixed system instruction followed by a human message
# whose only placeholder is {context}.
chat_prompt_extr_narr = ChatPromptTemplate.from_messages([
    SystemMessage(content=DESCRIPTION_OF_PURPOSE),
    HumanMessagePromptTemplate.from_template("Context:\n{context}"),
])

# Quick check that the template renders with a dummy context.
print(chat_prompt_extr_narr.invoke({"context": "hello"}))


messages=[SystemMessage(content='You are a AI agent for extracting the economic predictions from text. Use the given context to summarize information in it'), HumanMessage(content='Context:\nhello')]


In [8]:
from langchain_core.documents.base import Document

def format_docs(docs: "list[Document]") -> str:
    """Join the text of several documents into one context string.

    Args:
        docs: Documents whose ``page_content`` is concatenated, in the given order.

    Returns:
        The page contents separated by a ``------`` line; an empty string when
        ``docs`` is empty.
    """
    return "\n------\n".join(doc.page_content for doc in docs)

In [9]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableSequence, RunnableAssign, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# RETRIEVER

In [10]:
# Retrieval knob to experiment with: only documents whose relevance score is
# above the threshold are returned. The untuned alternative is the default
# retriever, vectorstore.as_retriever().
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=0.5),
)


# CHAIN
The current chain returns the retrieved documents (`context`) together with the answer generated from them (`answer`).

In [11]:
# Sub-chain that produces the answer text: the retrieved documents under
# 'context' are flattened into one string, fed into the prompt, sent to the
# model, and the reply is parsed to a plain string.
answer_chain = (
    RunnablePassthrough.assign(
        context=RunnableLambda(lambda state: format_docs(state["context"]))
    )
    | chat_prompt_extr_narr
    | llm
    | StrOutputParser()
)

# Full chain: the input string goes to the retriever only (the prompt has no
# placeholder other than {context}). The result is a dict holding the retrieved
# documents under 'context' and the generated text under 'answer'.
chain = (
    RunnableParallel({"context": retriever})
    | RunnablePassthrough.assign(answer=answer_chain)
)

In [12]:
# The question is the chain's input and is what the retriever searches with.
# The result is a dict with the retrieved documents under 'context' and the
# model's text under 'answer'.
QUESTION = "What are the predictions for the future economic conditions made by this document"
answer = chain.invoke(QUESTION)

In [13]:
len(answer['context'])

4

# Writing the answer and its sources to text files in the report directory

In [14]:
# Persist the generated answer next to the reports.
# The encoding is set explicitly so PDF-extracted text is written the same way on every platform.
with open(os.path.join(PATH_TO_DIR, "answer.txt"), mode="w", encoding="utf-8") as f:
    f.write(answer['answer'])

# Persist every retrieved chunk with its origin so the answer can be traced back.
with open(os.path.join(PATH_TO_DIR, "source.txt"), mode="w", encoding="utf-8") as f:
    for d in answer['context']:
        f.write("-" * 15 + "\n")
        # .get() tolerates documents that lack these metadata keys.
        source = d.metadata.get('source')
        page = d.metadata.get('page')
        if source:
            # NOTE(review): the "souce" label is misspelled; left unchanged in case the file is parsed elsewhere.
            f.write(f"souce: {source}")
        # Compare against None rather than truthiness: page 0 is a valid page
        # number (PyPDFLoader counts pages from 0) and would otherwise be dropped.
        if page is not None:
            f.write(f"\tpage: {page}")
        f.write("\n\n" + d.page_content + "\n\n")