# Exploring again using llms to process data


In [1]:
# Load the `dotenv` IPython extension, then read variables from ../.env into
# the environment. Presumably this supplies the OpenAI API key used by the
# later cells -- confirm against the .env file.
%load_ext dotenv
%dotenv ../.env

## [pdf loader](https://python.langchain.com/v0.2/docs/how_to/document_loader_pdf/)
https://python.langchain.com/v0.2/docs/how_to/document_loader_pdf/

In [2]:
from langchain_community.document_loaders import PyPDFLoader



In [22]:

# Source PDF: the U.S. Bancorp CCAR 2024 disclosure.
file_path = "./data/U-S-Bancorp-CCAR-2024-Disclosure_Final.pdf"

# Parse the PDF into LangChain documents.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Report how many documents the loader produced.
print(len(docs))

11


PyPDFLoader loads the U.S. Bancorp CCAR disclosure and returns one document per page (11 here)
* `load()` does not split the pages any further; `load_and_split`, by default, uses [RecursiveCharacterTextSplitter](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html#langchain_community.document_loaders.pdf.PyPDFLoader.load_and_split)


# making model to answer

In [10]:
# Chat model that the retrieval chains use to answer extraction questions.
from langchain_openai import ChatOpenAI

# temperature=0 makes table extraction as repeatable as the API allows.
# Without it the model samples at the library's default temperature, so
# re-running the same question can return a differently worded/shaped table.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)



In [12]:
from langchain_community.vectorstores import FAISS 
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index the vectors in FAISS.
vectorstore = FAISS.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Expose the index as a retriever using its default search settings.
retriever = vectorstore.as_retriever()



# making rag chain

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [17]:

# System message: answer from the retrieved context and format the answer as
# CSV. The "{context}" placeholder receives the retrieved chunks.
system_prompt = (
    "You are an assistant for extracting tables from pdf. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Output your answer in csv format "
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions plus the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Inner chain stuffs the retrieved documents into the prompt and calls the
# model; the outer chain runs the retriever first and feeds it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Ask for the loan-loss table and keep the full result dict.
results = rag_chain.invoke({"input": "Output the table on loan losses"})

results



{'input': 'Output the table on loan losses',
 'context': [Document(page_content='Loan losses\nProjected loan losses, by type of loan, 2024:Q1-2026:Q1\nSupervisory Severely Adverse Scenario\nBillions of \ndollarsPortfolio loss\nLoan type rates (percent) (1)\nLoan losses 15.7 4.4%\n  First-lien mortgages, domestic 1.0 0.9%\n  Junior liens and HELOCs, (2) domestic 0.3 2.5%\n  Commercial and industrial (3)4.6 4.6%\n  Commercial real estate, domestic 3.4 7.1%\n  Credit cards 4.7 17.3%\n  Other consumer (4)0.9 3.6%\n  Other loans (5)0.8 2.8%\n(1)  Average loan balances used to calculate portfolio loss rates exclude loans held for sale, loans held for investment under the fair-value option, and  \n      Paycheck Protection Program loans and are calculated over nine quarters.\n(2)  Home equity lines of credit (HELOCs) .\n(3)  Commercial and industrial loans include small- and medium-enterprise loans and corporate cards.\n(4)  Other consumer loans include student loans and automobile loans.\n(5

In [14]:
# Display only the model's answer text from the chain result.
results["answer"]

'Loan Type, Projected Losses (Billions of dollars), Loss Rate (%)\nFirst-lien mortgages, domestic, 1.0, 0.9%\nJunior liens and HELOCs, domestic, 0.3, 2.5%\nCommercial and industrial, 4.6, 4.6%\nCommercial real estate, domestic, 3.4, 7.1%\nCredit cards, 4.7, 17.3%\nOther consumer, 0.9, 3.6%\nOther loans, 0.8, 2.8%'

In [16]:
# Save the CSV-formatted answer to disk.
# The encoding is pinned to UTF-8 so the write does not depend on the
# platform's default encoding, which can raise UnicodeEncodeError when the
# model output contains non-ASCII characters.
with open('llm_pdf_to_csv.csv', 'w', encoding='utf-8') as f:
    f.write(results['answer'])

## csv results
* this ended up missing the first line of the table because it was in a different format (bolded)
* commas in the text messed up the csv format


# retry with json
* converting to json format

In [18]:
# System message: same extraction instructions as the CSV attempt, but the
# answer is requested as JSON. "{context}" receives the retrieved chunks.
system_prompt = (
    "You are an assistant for extracting tables from pdf. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Output your answer in json format "
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions plus the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Rebuild both chains so they pick up the new prompt.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Same question as before; only the requested output format changed.
results = rag_chain.invoke({"input": "Output the table on loan losses"})

results

{'input': 'Output the table on loan losses',
 'context': [Document(page_content='Loan losses\nProjected loan losses, by type of loan, 2024:Q1-2026:Q1\nSupervisory Severely Adverse Scenario\nBillions of \ndollarsPortfolio loss\nLoan type rates (percent) (1)\nLoan losses 15.7 4.4%\n  First-lien mortgages, domestic 1.0 0.9%\n  Junior liens and HELOCs, (2) domestic 0.3 2.5%\n  Commercial and industrial (3)4.6 4.6%\n  Commercial real estate, domestic 3.4 7.1%\n  Credit cards 4.7 17.3%\n  Other consumer (4)0.9 3.6%\n  Other loans (5)0.8 2.8%\n(1)  Average loan balances used to calculate portfolio loss rates exclude loans held for sale, loans held for investment under the fair-value option, and  \n      Paycheck Protection Program loans and are calculated over nine quarters.\n(2)  Home equity lines of credit (HELOCs) .\n(3)  Commercial and industrial loans include small- and medium-enterprise loans and corporate cards.\n(4)  Other consumer loans include student loans and automobile loans.\n(5

In [19]:
# Save the JSON-formatted answer as plain text.
# The encoding is pinned to UTF-8 so the write does not depend on the
# platform's default encoding, which can raise UnicodeEncodeError when the
# model output contains non-ASCII characters.
with open('llm_pdf_to_text.txt', 'w', encoding='utf-8') as f:
    f.write(results['answer'])

## json results
The output is in a non-standard format, though the information is present.  
My concern is that this is not scalable, since it would require additional post-processing (which may differ from one PDF to another).

# 2024 fed 

In [27]:

# Switch the pipeline to the 2024 DFAST results PDF.
file_path = "./data/2024-dfast-results-20240626.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()

# Re-create the chat model. temperature=0 makes table extraction as repeatable
# as the API allows; without it the model samples at the library's default
# temperature and the same question can return a different table on each run.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

# Split the new document into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Rebuild the FAISS index and retriever so they point at the new document
# instead of the previously indexed PDF.
vectorstore = FAISS.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()




In [34]:

# System message for semicolon-delimited CSV output.
# Adjacent string literals are concatenated with no separator, so every
# fragment that is followed by more prose must end with a space. The original
# was missing one after "semicolons." and sent "semicolons.make sure" to the
# model. The spelling of "separated" is corrected as well.
system_prompt = (
    "You are an assistant for extracting tables from pdf. "
    "Use the following pieces of retrieved context to answer "
    "the question. Output your answer in csv format separated with semicolons. "
    "make sure to include the column headers and the whole table"
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions plus the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Inner chain stuffs the retrieved documents into the prompt and calls the
# model; the outer chain runs the retriever first and feeds it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Ask for Table 6 (capital ratios) and keep the full result dict.
results = rag_chain.invoke({"input": "Output the table 6 which is Capital ratios"})

results



{'input': 'Output the table 6 which is Capital ratios',
 'context': [Document(page_content='Table 6. Capital ratios, actual 2023:Q4 and projected 2024:Q1–2026:Q1 under the se verely adver se scenario: 31 banks\nPercent\nBankCommon equity\ntier 1 capital ratioTier 1 capital ratio Total capital ratio Tier 1 le verage ratioSupplementar y\nleverage ratio1\nActual\n2023:\nQ4EndingMini-\nmumActual\n2023:\nQ4EndingMini-\nmumActual\n2023:\nQ4EndingMini-\nmumActual\n2023:\nQ4EndingMini-\nmumActual\n2023:\nQ4EndingMini-\nmum\nAlly 9.47.17.010.8 8.58.512.410.110.1 8.76.86.8\nAmerican Expr ess 10.511.8 9.411.312.510.213.114.412.0 9.910.9 8.7\nBank of America 11.8 9.39.113.511.010.815.212.912.8 7.15.85.76.14.94.8\nBank of NY-Mellon 12.014.812.214.817.615.015.818.616.2 6.07.16.17.48.77.5\nBarclays US 13.711.1 9.515.112.611.016.914.513.0 8.57.06.06.05.04.3\nBMO 10.5 5.05.011.1 5.75.712.9 7.67.68.34.14.17.23.63.6\nCapital One 12.9 7.67.614.2 9.09.016.010.810.811.2 7.07.09.66.06.0\nCharles Sch wab Corp

In [35]:
# Print the answer so its embedded newlines render as separate table rows.
print(results["answer"])

Bank; Common equity tier 1 capital ratio; Tier 1 capital ratio; Total capital ratio; Tier 1 leverage ratio; Supplementary leverage ratio
Ally; 9.4; 7.1; 7.0; 10.8; 8.5
American Express; 10.5; 11.8; 9.4; 11.3; 12.5
Bank of America; 11.8; 9.3; 9.1; 13.5; 11.0
Bank of NY-Mellon; 12.0; 14.8; 12.2; 14.8; 17.6
Barclays US; 13.7; 11.1; 9.5; 15.1; 12.6
BMO; 10.5; 5.0; 5.0; 11.1; 5.7
Capital One; 12.9; 7.6; 7.6; 14.2; 9.0
Charles Schwab Corp; 24.5; 27.3; 25.3; 31.7; 34.5


In [36]:
# Save the semicolon-delimited capital-ratios table to its own file.
# The original appended ('a') to 'llm_pdf_to_csv.csv', which already holds the
# comma-delimited loan-loss table written without a trailing newline. That
# fused the first new row onto the last old row, mixed two delimiters in one
# file, and duplicated the data on every re-run of this cell. Writing a
# separate file in 'w' mode keeps the cell idempotent and the CSV parseable.
# UTF-8 is pinned so the write does not depend on the platform default.
with open('llm_pdf_to_csv_capital_ratios.csv', 'w', encoding='utf-8') as f:
    f.write(results['answer'])

## llm results
with the larger document, it is having trouble getting the full table  
this may be an issue with the text splitter