In [None]:
import re
import io
from pathlib import Path
import json
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import tabula
import pymupdf
from PIL import Image
from IPython.display import display

import nest_asyncio
from components.llm_blsheet_extractor import extract_balance_sheet, extract_text_from_pdf_with_llamaparse

# Patch asyncio through nest_asyncio.
# NOTE(review): presumably needed so the LlamaParse helper can run async code
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

# Every data path is anchored on the parent of the current working directory.
BASE_DIR = Path.cwd().parent
PDF_PATH_3M = BASE_DIR.joinpath('data/raw_annual_report/3M 2022 Annual Report_Updated.pdf')
PDF_PATH_IBM = BASE_DIR.joinpath('data/raw_annual_report/IBM_Annual_Report_2023.pdf')

In [None]:
# Open both annual reports with PyMuPDF; doc_3m is read again by a later cell.
doc_3m, doc_ibm = (pymupdf.open(pdf_path) for pdf_path in (PDF_PATH_3M, PDF_PATH_IBM))

# 1. consolidated balance sheetのページ数の取得

### 成功: pymupdfでoutline抽出

In [None]:
def get_balance_sheet_page(doc):
    """Return the 0-based page indices covered by balance-sheet outline entries.

    The document outline (``doc.get_toc()``) is scanned for entries whose title
    contains "balance sheet" (case-insensitive) and whose page number is not -1.
    Each matching section runs from its own page up to, but not including, the
    page of the next outline entry that has a usable page number.

    Args:
        doc: An object exposing ``get_toc()`` that yields entries indexable as
            ``entry[1]`` (title) and ``entry[2]`` (1-based page number, or -1).
            ``len(doc)`` must give the page count; it is only consulted when a
            matching entry has no later entry with a usable page number.

    Returns:
        list[int]: 0-based page indices in outline order, without duplicates.
        Empty when no outline entry matches.
    """
    outline = doc.get_toc()
    balance_sheet_pages = []
    for idx, entry in enumerate(outline):
        title, page_number = entry[1], entry[2]
        if 'balance sheet' not in title.lower() or page_number == -1:
            continue

        start_page = page_number - 1

        # Skip following entries without a page target (-1): using them as the
        # section end would produce an empty range and drop the section.
        next_page_number = next(
            (later[2] for later in outline[idx + 1:] if later[2] != -1),
            None,
        )
        if next_page_number is None:
            # Last usable entry in the outline: the section runs to the end of
            # the document instead of raising IndexError on outline[idx + 1].
            end_page = len(doc)
        else:
            end_page = next_page_number - 1

        # The entry's own page always belongs to the section, even when the
        # next entry starts on that same page (or the outline is out of order).
        end_page = max(end_page, start_page + 1)

        for page in range(start_page, end_page):
            # Overlapping matches (e.g. nested outline entries) must not list
            # the same page twice.
            if page not in balance_sheet_pages:
                balance_sheet_pages.append(page)
    return balance_sheet_pages

# Look up the balance-sheet page indices of each report from its PDF outline.
balance_sheet_pages_3m, balance_sheet_pages_ibm = map(get_balance_sheet_page, (doc_3m, doc_ibm))

# 2. balance sheetの表抽出

### 失敗1: pymupdfではtableを検出できず

In [None]:
# Run PyMuPDF's table finder on the first balance-sheet page of the 3M report
# and print whatever tables it reports.
first_bl_sheet_index = balance_sheet_pages_3m[0]
bl_sheet_page = doc_3m[first_bl_sheet_index]
tabs = bl_sheet_page.find_tables()
print(tabs.tables)

### 失敗2: tabulaではtableを検出できるが、カラム名が抽出できない等のミスが見られる

In [None]:
# balance_sheet_pages_3m stores 0-based indices; the +1 turns the first one into
# the 1-based page number passed to tabula.
tabula_page_number = balance_sheet_pages_3m[0] + 1
tabs = tabula.read_pdf(PDF_PATH_3M, lattice=False, pages=tabula_page_number)
print(tabs[0].to_markdown())

### 成功1: LlamaParseのmarkdown parserでは正確に表形式で抽出することに成功

In [None]:
def extract_markdown_table(content):
    """Collect the lines of the first markdown table found in ``content``.

    A table line is any line whose first character is ``|``. Collection starts
    at the first such line and stops at the first later line that does not
    start with ``|``, so only the first contiguous table is returned.

    Args:
        content: Text to scan, split on line boundaries.

    Returns:
        list[str]: The table lines, unmodified and in order; empty when no line
        starts with ``|``.
    """
    rows = []
    for text_line in content.splitlines():
        if text_line.startswith('|'):
            rows.append(text_line)
            continue
        if rows:
            # A table was already being collected and this line is outside it.
            break
    return rows

def _split_markdown_row(line):
    """Split one markdown table line into its whitespace-stripped cell texts."""
    text = line.strip()
    # Drop exactly one outer pipe per side: str.strip('|') would also swallow
    # the pipes that delimit an empty first or last cell.
    if text.startswith('|'):
        text = text[1:]
    if text.endswith('|'):
        text = text[:-1]
    return [cell.strip() for cell in text.split('|')]


def convert_markdown_table_to_df(table_lines):
    """Build a DataFrame from the lines of a markdown table.

    The first line supplies the column names, the second line is taken to be
    the markdown delimiter row (``|---|---|``) and is skipped, and every
    remaining line becomes one data row. Header and cell texts are stripped of
    surrounding whitespace.

    Args:
        table_lines: Table lines as strings, e.g. the result of
            ``extract_markdown_table``.

    Returns:
        pandas.DataFrame: One column per header cell, holding the cell texts as
        strings. An empty DataFrame when ``table_lines`` is empty.

    Raises:
        ValueError: Raised by pandas when a data row's cell count does not
            match the header's.
    """
    table_data = [_split_markdown_row(line) for line in table_lines]
    if not table_data:
        return pd.DataFrame()
    headers = table_data[0]
    # table_data[1] is the delimiter row, so data starts at index 2.
    data_rows = table_data[2:]
    return pd.DataFrame(data_rows, columns=headers)

# Parse the first balance-sheet page of the 3M report with the LlamaParse helper,
# cut the first markdown table out of the returned text and print it as a table.
first_page_3m = balance_sheet_pages_3m[0]
extracted_text_llama = extract_text_from_pdf_with_llamaparse(PDF_PATH_3M, first_page_3m)
extracted_markdown_table = extract_markdown_table(extracted_text_llama)
balance_sheet_df = convert_markdown_table_to_df(extracted_markdown_table)
print(balance_sheet_df.to_markdown())

### 成功2: LlamaParseで抽出したtextをさらにLLMで構造化することに成功

In [None]:
# Make sure the output folder exists before the extraction starts, so the
# write cannot fail on a missing directory after the extraction has already run.
output_path_3m = BASE_DIR / 'data/extracted_balance_sheet/3m_2022.json'
output_path_3m.parent.mkdir(parents=True, exist_ok=True)

# extract_balance_sheet returns a JSON string per page; the 'categories' lists
# of all balance-sheet pages are pooled into one list.
extracted_balance_sheets = []
for page_num in balance_sheet_pages_3m:
    balance_sheet = extract_balance_sheet(PDF_PATH_3M, page_num, target_year=2022)
    extracted_balance_sheets.extend(json.loads(balance_sheet)['categories'])

with open(output_path_3m, 'w', encoding='utf-8') as f:
    json.dump(extracted_balance_sheets, f, indent=2)

extracted_balance_sheets
