In [53]:
import pandas as pd
import re
from os import listdir
from tqdm import tqdm
from anytree import NodeMixin, RenderTree, PreOrderIter

In [54]:
def get_patern_of_bullet(String):
    """Classify a bullet marker and return its pattern and nesting level.

    ``String`` is tested with ``re.match`` (anchored at the start) against a
    fixed, ordered list of bullet patterns; the first pattern that matches wins.

    Args:
        String: Candidate bullet token, e.g. ``'1.'``, ``'1.2'``, ``'3)'``,
            ``'(4)'`` or ``'65041-1'``.

    Returns:
        A ``(pattern, level)`` tuple, where ``pattern`` is the source text of
        the matching regex and ``level`` is its numeric level. For the
        patterns with base level 2, 20 or 50 the number of ``'.'`` characters
        in ``String`` is added to the base level. ``('', 0)`` is returned when
        no pattern matches.
    """
    # Raw strings: ``\d``, ``\.``, ``\(`` and ``\)`` are regex escapes, not
    # Python string escapes (non-raw literals trigger an invalid-escape warning).
    regx = [
        (r'\d{5}-\d+$', 70),
        (r'[1-9][0-9]*(\.[1-9][0-9]*)*\)$', 20),
        (r'\(\d*(\.?\d*)*\)$', 50),
        (r'[1-9][0-9]*(\.[1-9][0-9]*)+$', 2),
        (r'[1-9][0-9]*\.$', 1),
    ]
    # Base levels whose final level also depends on how many dots the marker has.
    dotted_levels = (2, 20, 50)

    for pattern, level in regx:
        if re.match(pattern, String):
            if level in dotted_levels:
                level += String.count('.')
            return pattern, level
    return '', 0

In [55]:
# Read the cleaned line table; the CSV's 'index' column becomes the row index.
df = pd.read_csv(
    'bkk-bud_cleaned.csv',
    index_col='index',
)
# One group per (filename, page, line_num) combination.
fpl_group = df.groupby(by=['filename', 'page', 'line_num'])

In [56]:
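# Disabled one-off maintenance step, kept for reference: overwrite the `text`
# of the rows listed in 'bkk-toFix-fixed.csv' (presumably hand-corrected) and
# save the result back to 'bkk-bud_cleaned.csv'.
# tof = pd.read_csv('bkk-toFix-fixed.csv', index_col='index')
# df.loc[tof.index] = df.loc[tof.index].assign(text=tof.text)
# df.to_csv('bkk-bud_cleaned.csv')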

In [57]:
# Cut the grouped lines into entries:
#   ('job',  [row ids])  - a job header line
#   ('list', [row ids])  - a bullet item, collected from its bullet line up to
#                          and including the line whose last token is 'บาท'
bulletFlag = False  # True while a bullet item is open and still collecting rows
entry = []          # row ids of the bullet item currently being collected
entries = []        # finished (kind, row ids) pairs, in iteration order

for key, rows in tqdm(fpl_group):
    line_no = key[2]
    if not line_no:
        # Groups whose line_num is 0 are ignored.
        continue

    texts = rows.text.values
    first = texts[0]
    starts_with_job = first.startswith('งาน')

    # Job header: an explicit 'งาน:' prefix anywhere, or a 'งาน' line within
    # the first two lines of the page.
    if first.startswith('งาน:') or (starts_with_job and line_no <= 2):
        entries.append(('job', rows.index.to_list()))
        continue

    # Only the first space-separated token of the line can be a bullet marker.
    marker = get_patern_of_bullet(first.split(' ')[0])
    last_token = ' '.join(texts).rsplit(' ', 1)[-1]

    if starts_with_job:
        # Any other 'งาน' line: keep its text in `job` (without the last two
        # cells when the line ends in 'บาท') but do not emit an entry.
        job = ' '.join(texts[:-2] if last_token == 'บาท' else texts)
        continue

    if marker[1]:
        bulletFlag = True

    if not bulletFlag:
        continue

    entry.extend(rows.index.to_list())
    if last_token == 'บาท':
        # The item is complete: store it and start a fresh one.
        bulletFlag = False
        entries.append(('list', entry))
        entry = []


100%|██████████| 43112/43112 [00:17<00:00, 2434.82it/s]


In [58]:
class Budget(NodeMixin):
    """One node of the budget tree.

    Args:
        id: Identifier stored on the node.
        name: Text of the node; also used as its ``repr``.
        idx: Sequence of row labels of the global ``df`` that make up the node.
        bullet: Numeric bullet level stored on the node.
        parent: Optional parent node.
        children: Optional children; only assigned when truthy.

    The ``amount`` attribute is filled as follows: when ``idx`` has more than
    two labels it is the ``text`` of the second-to-last row in ``df``;
    otherwise, when ``name`` has more than two space-separated tokens, it is
    the second-to-last token; otherwise it is ``''``.
    """

    def __init__(self, id, name, idx, bullet, parent=None, children=None):
        super().__init__()
        self.name = name
        self.id = id
        self.idx = idx

        amount = ''
        if len(idx) > 2:
            amount = df.loc[idx[-2]].text
        else:
            tokens = name.split(' ')
            if len(tokens) > 2:
                amount = tokens[-2]
        self.amount = amount

        self.bullet = bullet
        self.parent = parent
        if children:
            self.children = children

    def __repr__(self):
        return self.name


In [59]:
# Build the budget tree from `entries`: every 'job' entry becomes a child of
# the root, and every 'list' entry is nested below the current job according
# to its bullet level.
BUDGET_RUNNING_ID = 0
root = Budget(BUDGET_RUNNING_ID, 'BKK2022', [-1], -2)
BUDGET_RUNNING_ID += 1

# Start at the root so that bullet entries appearing before the first job
# header are attached to the tree instead of becoming detached nodes.
job_root = root
curr = root

for kind, entry in tqdm(entries):
    entryText = ' '.join(df.loc[entry].text.values)

    if kind == 'job':
        job_root = Budget(BUDGET_RUNNING_ID, entryText, entry, -1, root)
        BUDGET_RUNNING_ID += 1
        curr = job_root
        continue

    entry_bullet = get_patern_of_bullet(df.loc[entry[0]].text.split(' ')[0])
    level = entry_bullet[1]

    # Climb until `curr` has a strictly smaller bullet level than the new
    # entry, never going above the current job node (compare nodes by
    # identity, not a bullet level against a node).
    while curr is not job_root and level <= curr.bullet:
        curr = curr.parent

    curr = Budget(BUDGET_RUNNING_ID, entryText, entry, level, curr)
    BUDGET_RUNNING_ID += 1


100%|██████████| 28273/28273 [00:19<00:00, 1441.94it/s]


In [60]:
# Flatten the tree (every node except the root, in pre-order) into two tables:
# child -> parent links, and per-node details.
nodes = [node for node in PreOrderIter(root) if not node.is_root]

has_parent = pd.DataFrame(
    [{'id': node.id, 'parent': node.parent.id} for node in nodes]
)

item_records = []
for node in nodes:
    # Location columns come from the first row that belongs to the node.
    first_row = df.loc[node.idx[0]]
    item_records.append({
        'id': node.id,
        'text': node.name,
        # Strip spaces and thousands separators from the raw amount text.
        'amount': node.amount.replace(' ', '').replace(',', ''),
        'filename': first_row.filename,
        'page': first_row.page,
        'line_num': first_row.line_num,
    })
item = pd.DataFrame(item_records)

has_parent.to_csv('2022bkk-buget-has_parent.csv', index=False)
item.to_csv('2022bkk-buget-item.csv', index=False)

In [65]:
# Show the items whose cleaned amount is not made of digits only.
item[~item.amount.map(str.isdigit)]

Unnamed: 0,id,text,amount,filename,page,line_num
8578,8579,"1. งบบุคลากร 7,477,800.00 บาท",7477800.00,65041.pdf,29,2
8620,8621,"1. งบบุคลากร 4,345,600.00 บาท",4345600.00,65041.pdf,32,2
8674,8675,"1. งบบุคลากร 37,189,560.00 บาท",37189560.00,65041.pdf,36,2
8823,8824,"1. งบบุคลากร 2,492,680.00 บาท",2492680.00,65041.pdf,44,2
8871,8872,"1. งบบุคลากร 7,900,400.00 บาท",7900400.00,65041.pdf,47,2
8915,8916,"1. งบบุคลากร 3,666,540.00 บาท",3666540.00,65041.pdf,50,2
8964,8965,"1. งบบุคลากร 1,486,800.00 บาท",1486800.00,65041.pdf,53,2
9008,9009,"1. งบบุคลากร 9,078,000.00 บาท",9078000.00,65041.pdf,54,29
9373,9374,"1. งบบุคลากร 1,319,020.00 บาท",1319020.00,65042.pdf,42,2
9716,9717,"1. งบบุคลากร 7,003,500.00 บาท",7003500.00,65043.pdf,32,2
