In [20]:
%load_ext dotenv
%dotenv ../.env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [21]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# Embedding model: turns text chunks into vectors for the FAISS index.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# Chat model: the LLM that is invoked for extraction.
llm = ChatOpenAI(model="gpt-4o-mini")

In [19]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os
# Loader class applied to every PDF found in the reports folder.
file_loader = PyPDFLoader
# Split documents into chunks of up to 1000 units with a 200-unit overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

PATH_TO_PDFS = './reports/24Q1USBANK'
# The FAISS index is cached inside the reports folder so it is only built once.
INDEX_PATH = os.path.join(PATH_TO_PDFS, 'faiss_index')

if os.path.exists(INDEX_PATH):
    # SECURITY: this flag opts in to pickle-based deserialization of the saved
    # index. Only load an index that this notebook (or a trusted source) wrote.
    vectorstore = FAISS.load_local(INDEX_PATH, embedding_model, allow_dangerous_deserialization=True)
else:
    docs = []
    # Sorted so the chunk order (and therefore the built index) is reproducible.
    for f in sorted(os.listdir(PATH_TO_PDFS)):
        file_path = os.path.join(PATH_TO_PDFS, f)
        # The folder may also hold sub-folders and non-PDF artefacts (images,
        # JSON exports); skip anything that is not a regular *.pdf file so the
        # PDF loader is never pointed at something it cannot parse.
        if not (os.path.isfile(file_path) and f.lower().endswith('.pdf')):
            continue
        # Loading happens only for entries that passed the check above, so a
        # skipped entry can neither reuse a previous loader nor hit an
        # undefined one.
        docs.extend(file_loader(file_path).load())
    if not docs:
        # Fail with a clear message instead of an obscure error from the
        # vector-store constructor when there is nothing to index.
        raise FileNotFoundError(
            f"No PDF files found in {PATH_TO_PDFS!r}; cannot build the FAISS index."
        )
    splits = text_splitter.split_documents(docs)
    vectorstore = FAISS.from_documents(splits, embedding_model)
    vectorstore.save_local(INDEX_PATH)

In [22]:
from langchain_core.pydantic_v1 import BaseModel, Field, validator

In [23]:
# Schema of the balance-sheet figures to extract. Every field is a float with
# no default, i.e. all of them are required.
# NOTE(review): the docstring and the Field descriptions presumably end up in
# the schema handed to the model (output parser / tool binding), so treat them
# as prompt text rather than plain documentation — confirm before rewording.
class BalanceSheet(BaseModel):
    """
    Fields to extract from balance sheet 
    """
    # --- Assets ---
    total_assets: float = Field(description="total assets")
    cash: float = Field(description="cash and cash equivalents")
    acc_rec: float = Field(description="accounts receivable")
    inv: float = Field(description="inventory")
    capital_assets : float = Field(description="Property, plant, equipment, and things that fall under capital assets")
    intangible_assets : float = Field(description="intangible assets")
    other_assets : float = Field(description="any asset that does not fall under any other asset category")

    # --- Liabilities ---
    total_liabilities : float = Field(description="total liabilities")
    acc_payable : float = Field(description="accounts payable")
    notes_payable : float = Field(description="Short term debt and notes payable")
    longterm_debt : float = Field(description="long term debt")
    other_liabilities : float = Field(description="other liabilites")

    # --- Equity ---
    total_equity : float = Field(description="total equity")
    common_stock : float = Field(description="common stock")
    prefered_stock : float = Field(description="prefered stock")
    retained_earnings : float = Field(description="retained_earnings")
    other_equity : float = Field(description="other equity")
    

# with parser

In [7]:
from langchain.output_parsers import PydanticOutputParser

In [8]:
# Output parser built around the BalanceSheet schema.
parser = PydanticOutputParser(pydantic_object=BalanceSheet)
# Show how long the generated format instructions are (they get injected into the prompt).
format_instructions = parser.get_format_instructions()
len(format_instructions)

2499

In [9]:
from langchain_core.prompts import PromptTemplate


In [10]:
# Prompt layout: fixed preamble, then the parser's format instructions
# (pre-filled as a partial variable), then the caller-supplied query.
query_template = "Answer the user query.\n{format_instructions}\n{query}\n"
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables=dict(format_instructions=parser.get_format_instructions()),
    template=query_template,
)


# OpenAI tools
https://python.langchain.com/v0.1/docs/modules/model_io/output_parsers/types/openai_tools/

In [None]:
# Bind the BalanceSheet schema as a tool, but keep the result under its own
# name: `llm` stays a plain chat model instead of silently turning into a
# tool-bound runnable that other cells would then pick up.
llm = ChatOpenAI(model="gpt-4o-mini")
llm_with_tools = llm.bind_tools([BalanceSheet])
# Inspect the tool definitions that will be sent with each request.
llm_with_tools.kwargs['tools']

In [25]:
from langchain_core.prompts import ChatPromptTemplate

In [26]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# Embedding model, same configuration as at the top of the notebook.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
# Plain chat model (no tools bound) used for the image transcription below.
llm = ChatOpenAI(model="gpt-4o-mini")

In [52]:
import base64
# Read the balance-sheet image and base64-encode it so it can be embedded in
# an inline data URL. Forward slashes keep the path valid on Windows, Linux
# and macOS alike; a backslash path is treated as one literal file name on
# POSIX systems and is never found.
with open("./reports/24Q1USBANK/balance_sheet.png", "rb") as img:
    image_data = base64.b64encode(img.read()).decode('utf-8')

In [53]:
from langchain.schema.messages import HumanMessage, AIMessage

In [54]:
# The request has two parts: a text instruction naming the column to
# transcribe, and the balance-sheet image embedded as a base64 data URL.
instruction_part = {"type": "text", "text": "extract the column for March 31, 2023"}
image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{image_data}"},
}
message = HumanMessage(content=[instruction_part, image_part])


In [55]:
# Send the single multimodal message (instruction + image) to the chat model.
response = llm.invoke([message])

In [56]:
# print() instead of a bare expression so the line breaks in the reply are rendered.
print(response.content)

Here is the extracted column for March 31, 2023:

**Assets**
- Cash and due from banks: $76,985
- Investment securities: 
  - Held-to-maturity: $82,948
  - Available-for-sale: $72,426
- Loans held for sale: $2,080
- Loans:
  - Commercial: $134,726
  - Commercial real estate: $52,677
  - Residential mortgages: $116,079
  - Credit card: $27,844
  - Other retail: $43,262
  - Total loans: $374,588
  - Less allowance for loan losses: ($7,514)
  - Net loans: $367,074
- Premises and equipment: $3,537
- Goodwill: $12,479
- Other intangible assets: $6,031
- Other assets: $60,046
- **Total assets: $683,606**

**Liabilities and Shareholders' Equity**
- Deposits:
  - Noninterest-bearing: $91,220
  - Interest-bearing: $436,843
  - Total deposits: $528,063
- Short-term borrowings: $17,102
- Long-term debt: $52,693
- Other liabilities: $29,715
- **Total liabilities: $627,573**

**Shareholders' equity**
- Preferred stock: $6,808
- Common stock: $21
- Capital surplus: $8,642
- Retained earnings: $74,47

Extraction works better when the input contains no other data;\
additional data usually confuses the model.\
The model cannot reliably do math, so only ask it to transcribe clearly presented data.

# processing markdown

In [62]:
import re

In [63]:
# Raw markdown text of the model's reply; it is parsed line by line further down.
md_resp = response.content
# Container for the parsed label -> amount pairs.
my_dict = {}


In [105]:
# Show the text that the regex below has to cope with.
print(md_resp)

Here is the extracted column for March 31, 2023:

**Assets**
- Cash and due from banks: $76,985
- Investment securities: 
  - Held-to-maturity: $82,948
  - Available-for-sale: $72,426
- Loans held for sale: $2,080
- Loans:
  - Commercial: $134,726
  - Commercial real estate: $52,677
  - Residential mortgages: $116,079
  - Credit card: $27,844
  - Other retail: $43,262
  - Total loans: $374,588
  - Less allowance for loan losses: ($7,514)
  - Net loans: $367,074
- Premises and equipment: $3,537
- Goodwill: $12,479
- Other intangible assets: $6,031
- Other assets: $60,046
- **Total assets: $683,606**

**Liabilities and Shareholders' Equity**
- Deposits:
  - Noninterest-bearing: $91,220
  - Interest-bearing: $436,843
  - Total deposits: $528,063
- Short-term borrowings: $17,102
- Long-term debt: $52,693
- Other liabilities: $29,715
- **Total liabilities: $627,573**

**Shareholders' equity**
- Preferred stock: $6,808
- Common stock: $21
- Capital surplus: $8,642
- Retained earnings: $74,47

In [106]:
# Sample lines for sanity-checking the pattern: one positive amount and one
# negative amount written accounting-style, i.e. wrapped in parentheses.
my_str = "- Cash and due from banks: $76,985"
my_str2 = "- Less allowance for loan losses: ($7,514)"

# Matches one markdown bullet of the form "- <label>: $<amount>".
#   * name    - the label; any text without ':' or '*', so labels containing
#               apostrophes, periods, digits or parentheses (e.g.
#               "Accumulated other comprehensive income (loss)") are no longer
#               silently skipped. Surrounding whitespace is not captured.
#   * pos_val - amount written as $1,234 (optionally with decimals)
#   * neg_val - amount written as ($1,234), to be negated by the caller
# Bold markers (**) around the label or the amount are tolerated.
# Group names and numbering are unchanged, so existing consumers keep working.
# Not handled: negatives written with a leading minus sign ("-$1,234").
my_regex = (
    r'\s*-[\s*]*'                                # bullet dash, optional bold markers
    r'(?P<name>[^\s:*][^:*]*?)'                  # label, lazily up to the colon
    r'\s*:[\s*]*'                                # colon, optional closing bold markers
    r'((\$(?P<pos_val>[\d,]+(?:\.\d+)?))'        # $1,234 or $1,234.56
    r'|(\(\$(?P<neg_val>[\d,]+(?:\.\d+)?)\)))'   # ($1,234) or ($1,234.56)
    r'[\s*]*'                                    # trailing whitespace / bold markers
)

In [72]:
# Pre-compiled form of my_regex for the quick checks that follow.
pattern = re.compile(my_regex)

In [73]:
# Try the compiled pattern on the positive-amount sample line.
my_match = pattern.match(my_str)

In [74]:
# match() returns None when the pattern does not match at the start of the
# string, so the type shows at a glance whether the sample line was accepted.
type(my_match)

NoneType

In [97]:
# Check the negative-amount sample: the value is expected in the `neg_val` group.
m = re.match(my_regex, my_str2)
m.groupdict()

{'name': 'Less allowance for loan losses', 'pos_val': None, 'neg_val': '7,514'}

In [107]:
# Rebuild the label -> amount mapping from scratch: every reply line that
# matches the bullet pattern contributes one entry; other lines are ignored.
# Amounts captured in parentheses are stored as negative numbers.
my_dict = {}
for line in md_resp.split("\n"):
    hit = re.match(my_regex, line)
    if hit is None:
        continue
    fields = hit.groupdict()
    label = fields['name']
    positive, negative = fields["pos_val"], fields['neg_val']
    if positive:
        my_dict[label] = float(positive.replace(',', ''))
    elif negative:
        my_dict[label] = -1 * float(negative.replace(',', ''))

In [108]:
# Show the parsed label -> amount mapping.
print(my_dict)

{'Cash and due from banks': 76985.0, 'Held-to-maturity': 82948.0, 'Available-for-sale': 72426.0, 'Loans held for sale': 2080.0, 'Commercial': 134726.0, 'Commercial real estate': 52677.0, 'Residential mortgages': 116079.0, 'Credit card': 27844.0, 'Other retail': 43262.0, 'Total loans': 374588.0, 'Less allowance for loan losses': -7514.0, 'Net loans': 367074.0, 'Premises and equipment': 3537.0, 'Goodwill': 12479.0, 'Other intangible assets': 6031.0, 'Other assets': 60046.0, 'Total assets': 683606.0, 'Noninterest-bearing': 91220.0, 'Interest-bearing': 436843.0, 'Total deposits': 528063.0, 'Short-term borrowings': 17102.0, 'Long-term debt': 52693.0, 'Other liabilities': 29715.0, 'Total liabilities': 627573.0, 'Preferred stock': 6808.0, 'Common stock': 21.0, 'Capital surplus': 8642.0, 'Retained earnings': 74473.0, 'Less treasury stock': -24023.0, 'Noncontrolling interests': 465.0, 'Total equity': 56033.0, 'Total liabilities and equity': 683606.0}


In [109]:
import json

In [116]:
# Persist the parsed figures as JSON next to the source reports.
with open("./reports/24Q1USBANK/balancesheet.json", "w+") as out_file:
    out_file.write(json.dumps(my_dict))

# Forcing JSON output?