In [4]:
from typing import Optional

from pydantic import BaseModel, Field


class Transaction(BaseModel):
    year: Optional[int] = Field(default=None, description="Year of the transaction")
    month: Optional[int] = Field(default=None, description="Month of the transaction")
    bank: Optional[str] = Field(default=None, description="Bank of the transaction")
    description: Optional[str] = Field(
        default=None, description="Description of the transaction"
    )
    amount: Optional[float] = Field(
        default=None, description="Amount of the transaction"
    )


class Transactions(BaseModel):
    transactions: list[Transaction]

In [2]:
import re
from pathlib import Path

inter = Path.cwd() / "database" / "inter.csv"
inter_content: str = re.sub(r"\s{2}+", "", inter.read_text().strip().lower())

total_lines = inter_content.count("\n")
half_size = total_lines // 2
first_half = "\n".join(inter_content.split("\n")[:half_size])
second_half = "\n".join(inter_content.split("\n")[half_size:])
total = [first_half, second_half]
print(f"Total lines: {total_lines}")
print(f"Lines in each half: {half_size}")

Total lines: 61
Lines in each half: 30


In [66]:
import os

from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI

MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

if not MISTRAL_API_KEY:
    raise ValueError("Missing MISTRAL_API_KEY environment variable")

prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an assistant in charge of extracting information from a bank statement written in Portuguese. "
            "Only extract information about the following attributes: year, month, bank, description, amount. "
            "Each transaction will have a specific date and will be either an expense (D or a negative value) or income (C or a positive value). "
            "For expenses use a negative amount and for incomes use a positive amount. "
            "There will be a single bank per statement. "
            "Return a JSON object with the following structure: {example}. That is, a list of transactions, where each transaction is a dictionary with the attributes year, month, bank, description, and amount. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{transactions}"),
    ]
)

llm = ChatMistralAI(name="mistral-large-latest", temperature=0)


def extract_transactions(llm, chunk: str) -> Transactions:
    output_example: str = "[{'month': <month>, 'year': <year>, 'bank': <bank>, 'description': <description>, 'amount': <amount>}]"

    prompt = prompt_template.invoke({"example": output_example, "transactions": chunk})

    response = llm.invoke(prompt)
    return response.content

In [67]:
formatted_batches = [extract_transactions(llm, content) for content in total]
formatted_batches

['[\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "crédito evento b3 * prov * juros s/capital 38 bbas3",\n"amount": 5.70\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "crédito evento b3 * prov * dividendos 30 petr4",\n"amount": 46.55\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "crédito evento b3 * prov * dividendos 30 petr4",\n"amount": 15.79\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "pix enviado nicki samuel da silva chaves",\n"amount": -16.00\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "pix enviado silvia regina leal zabaleta",\n"amount": -600.00\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "crédito evento b3 * prov * dividendos 133goau4",\n"amount": 17.29\n},\n{\n"month": "12",\n"year": 2024,\n"bank": "Banco do Brasil",\n"description": "compra inter cel inter pre 10

In [68]:
import json

cleaned_transactions = []
for batch in formatted_batches:
    cleaned_transactions.extend(json.loads(batch))

cleaned_transactions

[{'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'crédito evento b3 * prov * juros s/capital 38 bbas3',
  'amount': 5.7},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'crédito evento b3 * prov * dividendos 30 petr4',
  'amount': 46.55},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'crédito evento b3 * prov * dividendos 30 petr4',
  'amount': 15.79},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'pix enviado nicki samuel da silva chaves',
  'amount': -16.0},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'pix enviado silvia regina leal zabaleta',
  'amount': -600.0},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'crédito evento b3 * prov * dividendos 133goau4',
  'amount': 17.29},
 {'month': '12',
  'year': 2024,
  'bank': 'Banco do Brasil',
  'description': 'compra inter cel inter pre 10gb men

In [70]:
transactions = Transactions(transactions=cleaned_transactions)
for transaction in transactions.transactions:
    transaction.bank = "Inter"

transactions

Transactions(transactions=[Transaction(year=2024, month=12, bank='Inter', description='crédito evento b3 * prov * juros s/capital 38 bbas3', amount=5.7), Transaction(year=2024, month=12, bank='Inter', description='crédito evento b3 * prov * dividendos 30 petr4', amount=46.55), Transaction(year=2024, month=12, bank='Inter', description='crédito evento b3 * prov * dividendos 30 petr4', amount=15.79), Transaction(year=2024, month=12, bank='Inter', description='pix enviado nicki samuel da silva chaves', amount=-16.0), Transaction(year=2024, month=12, bank='Inter', description='pix enviado silvia regina leal zabaleta', amount=-600.0), Transaction(year=2024, month=12, bank='Inter', description='crédito evento b3 * prov * dividendos 133goau4', amount=17.29), Transaction(year=2024, month=12, bank='Inter', description='compra inter cel inter pre 10gb mensal', amount=-30.0), Transaction(year=2024, month=12, bank='Inter', description='cashback inter pre 10gb mensal', amount=3.0), Transaction(year