In [5]:
import pandas as pd
import re
from tqdm.notebook import tqdm
from datasets import Dataset
import numpy as np

In [6]:
df = pd.read_csv('../dataset/bullet-classification-v2.csv', index_col=0)
df.loc[:, 'label'] = df.prediction
df = df.dropna(axis=1,how='all')

In [7]:
entries = []
progress = tqdm(total=len(df))
for pdf_name, pdf_df in df.groupby(['pdf_name']):
    new_entry = False
    new_entry_index = []
    for index, row in pdf_df.iterrows():
        progress.update()
        if row.prediction == 'START':
            new_entry_index = []
            new_entry_index.append(index)
            new_entry = True
        elif row.prediction == 'END':
            if new_entry_index:
                new_entry_index.append(index)
                entries.append(new_entry_index)
            new_entry = False
            new_entry_index = []
        elif row.prediction == 'START_AND_END':
            new_entry_index = []
            entries.append([index])
            new_entry = False
        elif new_entry:
            new_entry_index.append(index)

  0%|          | 0/3351 [00:00<?, ?it/s]

# Per Line

In [636]:
def split_text(text, splits, names):
    # preprocess
    indexes = [0, len(text)]
    for split in splits:
        indexes.extend(split)
    names_index = {e: name for (s,e), name in zip(splits, names)}
    
    indexes = list(set(indexes))
    indexes.sort()
    pointer = 0
    tokens = []
    token_name = []
    for end in indexes[1:]:
        sub_string = text[pointer:end]
        for i, sub_token in enumerate(sub_string.split()):
            tokens.append(sub_token)
            token_name.append(names_index.get(end))
        pointer = end
    return tokens,token_name

In [637]:
rows = {}
for entry in entries:
    bullet_found = False
    entry_found = False
    for idx, row in df.loc[entry].iterrows():
        text = row.text
        
        splits = []
        names = []
        amount = re.findall(r'([\d,]+) ?บาท', text)

        if amount:
            amount = amount[0]
            amount_span = (text.index(amount),
                           text.index(amount) + len(amount))
            splits.append(amount_span)
            names.append('amount')
        
        if row.label != 'NOT_START_OR_END':
            bullet = re.match(r'([\d\-\.]+ ?|\([\d\.]+\) ?|[ก-ฮ]\. ?)+', text)
            if bullet is not None:
                bullet_span = bullet.span()
                splits.append(bullet_span)
                names.append('bullet')

        tokens, names = split_text(text, splits, names)
        new_name_tags = names.copy()
        amount_found = False
        for i, name in enumerate(names):
                if name == 'bullet':
                    new_name_tags[i] = 'B-BULLET' if not bullet_found else 'I-BULLET'
                    bullet_found = True
                elif not bullet_found and (tokens[i].startswith('งาน') or tokens[i].startswith('ด้าน')):
                    new_name_tags[i] = 'B-ENTRY'
                    entry_found = True
                    bullet_found = True
                elif bullet_found and name != 'amount' and not amount_found:
                    new_name_tags[i] = 'B-ENTRY' if not entry_found else 'I-ENTRY'
                    entry_found = True
                elif name == 'amount':
                    new_name_tags[i] = 'B-AMOUNT'
                    amount_found = True
                elif (row.label in ['START_AND_END','START']) and 'bullet' not in names and not amount_found:
                    new_name_tags[i] = 'I-ENTRY'
                elif name is None:
                    new_name_tags[i] = 'O'
        rows[idx] = {'tokens': tokens, 'names': new_name_tags}

entry_indexes = [idx for entry in entries for idx in entry]
for idx, row in df[~df.index.isin(entry_indexes)].iterrows():
    tokens,names = split_text(row.text,[],[])
    rows[idx] = {'tokens': tokens, 'names': ['O']*len(names)}

In [638]:
data={key: [val[key] for val in rows.values()] for key in ['tokens', 'names']}
named_df = pd.DataFrame(data, index = rows.keys())
named_df = df.join(named_df)

## Create 🤗 Dataset

In [640]:
from datasets import ClassLabel, Sequence

In [653]:
ds = Dataset.from_pandas(named_df[['text','tokens','names','label','pdf_name','line_num', 'page']])
class_label = ClassLabel(names=['O', 'B-AMOUNT', 'I-AMOUNT', 'B-BULLET', 'I-BULLET', 'B-ENTRY','I-ENTRY',])
ds = ds.cast_column('names', Sequence(feature=class_label))

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [654]:
ds.push_to_hub('bkk-budget-ner-line', private=True,)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/797 [00:00<?, ?B/s]

Updating downloaded metadata with the new split.


# Per Entry
## Create

In [643]:
entry_concated = {'tokens': [], 'ner_tags': []}

for entry in entries:
    label_tags = [rows[idx]['names'].copy() for idx in entry]
    text_tokens = [rows[idx]['tokens'].copy() for idx in entry]
    if len(entry) > 1:
        text_tokens[0][-1] += '\n'
    entry_concated['tokens'].append([tok for sent in text_tokens for tok in sent])
    entry_concated['ner_tags'].append([tag for sent in label_tags for tag in sent])

is_entry_count  = len(entry_concated['tokens'])
train_ratio = 0.85
train_set = list(range(0,int(is_entry_count*train_ratio)))

for idx, row in df[~df.index.isin(entry_indexes)].iterrows():
    tokens,names = split_text(row.text,[],[])
    entry_concated['tokens'].append(tokens)
    entry_concated['ner_tags'].append(['O']*len(names))

train_set += list(range(is_entry_count, int(len(entry_concated['tokens'])*train_ratio)))
test_set = [i for i in range(len(entry_concated['tokens'])) if i not in train_set]

In [644]:
len(train_set),len(entry_concated['tokens'])

(2317, 3038)

In [645]:
def create_tag_class(tags):
    name_tags = []
    for tags in tags:
        name_tags.extend(tags)
    name_tags = list(set(name_tags))
    return sorted(name_tags)

create_tag_class(entry_concated['ner_tags'])

['B-AMOUNT', 'B-BULLET', 'B-ENTRY', 'I-BULLET', 'I-ENTRY', 'O']

## Dataset

In [619]:
from datasets import ClassLabel, Sequence, DatasetDict

In [620]:
train_set_df=pd.DataFrame(entry_concated).loc[train_set]
test_set_df=pd.DataFrame(entry_concated).loc[test_set]

In [650]:
def get_ds(df):
    ds = Dataset.from_pandas(df)
    ds = ds.cast_column('ner_tags',
                        Sequence(feature=ClassLabel(
                            names=['O', 'B-AMOUNT', 'I-AMOUNT', 'B-BULLET', 'I-BULLET', 'B-ENTRY','I-ENTRY',])))
    return ds

In [652]:
ds = DatasetDict({'train': get_ds(train_set_df), 'test': get_ds(test_set_df)})
ds.push_to_hub('bkk-budget-named-tokens-concat', private=True,)

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/692 [00:00<?, ?B/s]

# Only `Entry` tag
## create

In [50]:
new_line_token  = '[BEG_LINE]'
end_line_token  = '[END_LINE]'
begin_doc_token = '[BEGIN_DOC]'

In [51]:
df = pd.read_csv('../dataset/bullet-classification-v2.csv', index_col=0)
df.loc[:, 'label'] = df.prediction
df = df.dropna(axis=1,how='all')

In [52]:
entries = []
progress = tqdm(total=len(df))
for pdf_name, pdf_df in df.groupby(['pdf_name']):
    new_entry = False
    new_entry_index = []
    for index, row in pdf_df.iterrows():
        progress.update()
        if row.prediction == 'START':
            new_entry_index = []
            new_entry_index.append(index)
            new_entry = True
        elif row.prediction == 'END':
            if new_entry_index:
                new_entry_index.append(index)
                entries.append(new_entry_index)
            new_entry = False
            new_entry_index = []
        elif row.prediction == 'START_AND_END':
            new_entry_index = []
            entries.append([index])
            new_entry = False
        elif new_entry:
            new_entry_index.append(index)
entry_indexes = [i for ent in entries for i in ent]

  0%|          | 0/3351 [00:00<?, ?it/s]

In [53]:
def split_tokens(idx, text, label):
    tokens = [new_line_token] + text.split() + [end_line_token]
    if idx not in entry_indexes:
        name_tags = ['O']*len(tokens)
        return {'tokens':tokens, 'name_tags': name_tags}

    name_tags = [None]*len(tokens)
    for i, token in enumerate(tokens):
        if label in ['START', 'START_AND_END']:
            name_tags[i] = 'I-ENTRY' if i else 'B-ENTRY'
        else:
            name_tags[i] = 'I-ENTRY'

    return {'tokens':tokens, 'name_tags': name_tags}

token_tag_df = df.apply(lambda x: pd.Series(split_tokens(x.name,x.text,x.label)), axis=1)
token_tag_df = df.join(token_tag_df)

### Version 3

In [54]:
df = pd.read_csv('../dataset/bullet-classification-v3.csv', index_col=0)
df = df.dropna(axis=1,how='all')
df.tag.unique()

array(['O', 'B-ENTRY', 'I-ENTRY', nan], dtype=object)

In [55]:
df.loc[669, 'tag'] = 'O'
df.tag.unique()

array(['O', 'B-ENTRY', 'I-ENTRY'], dtype=object)

In [59]:
def __():
    def split_text(row):
        text = [new_line_token] + row.text.split() + [end_line_token]
        if row.tag == 'B-ENTRY':
            tags = [row.tag] + ['I-ENTRY'] * (len(text) - 1)
        else:
            tags = [row.tag] * len(text)

        return pd.Series({'tokens': text, 'name_tags': tags})
    
    return df.join(df.apply(split_text, axis=1))
token_tag_v3_df = __()

In [60]:
data = {'tokens': [],'ner_tags': []}
# preprocess
for group, page_df in pd.concat([token_tag_df,token_tag_v3_df])\
                            .groupby(['pdf_name', 'page']):

    token_in_page = page_df['tokens'].values.tolist()
    name_tags = page_df['name_tags'].values.tolist()
    
    tokens    = [begin_doc_token] + [token for line in token_in_page for token in line]
    name_tags = ['O'] + [tag for line in name_tags for tag in line]

    assert len(tokens) == len(name_tags)
    data['tokens'].append(tokens)
    data['ner_tags'].append(name_tags)

In [61]:
from transformers import AutoTokenizer, AddedToken

tokenizer = AutoTokenizer.from_pretrained("Geotrend/bert-base-th-cased")
tokenizer.add_special_tokens({"additional_special_tokens": [new_line_token, end_line_token, begin_doc_token]})

3

In [62]:
data

{'tokens': [['[BEGIN_DOC]',
   '[BEG_LINE]',
   'ข้อบัญญัติกรุงเทพมหานคร',
   '[END_LINE]',
   '[BEG_LINE]',
   'เรื่อง',
   'งบประมาณรายจ่ายประจำปีงบประมาณ',
   'พ.ศ.',
   '2561',
   '[END_LINE]',
   '[BEG_LINE]',
   'โดยที่เป็นการสมควรมีข้อบัญญัติกรุงเทพมหานคร',
   'เรื่อง',
   'งบประมาณรายจ่ายประจำปี',
   '[END_LINE]',
   '[BEG_LINE]',
   'งบประมาณ',
   'พ.ศ.',
   '2561',
   '[END_LINE]',
   '[BEG_LINE]',
   'อาศัยอำนาจตามความในมาตรา',
   '97',
   'และมาตรา',
   '103',
   'แห่งพระราชบัญญัติระเบียบบริหาร',
   '[END_LINE]',
   '[BEG_LINE]',
   'ราชการกรุงเทพมหานคร',
   'พ.ศ.',
   '2528',
   'กรุงเทพมหานคร',
   'โดยความเห็นชอบของสภากรุงเทพมหานคร',
   '[END_LINE]',
   '[BEG_LINE]',
   'จึงตราข้อบัญญัติกรุงเทพมหานครขึ้นไว้',
   'ดังต่อไปนี้',
   '[END_LINE]',
   '[BEG_LINE]',
   'ข้อ',
   '1',
   'ข้อบัญญัติกรุงเทพมหานครนี้เรียกว่า',
   '“ข้อบัญญัติกรุงเทพมหานคร',
   'เรื่อง',
   'งบประมาณ',
   '[END_LINE]',
   '[BEG_LINE]',
   'รายจ่ายประจำปีงบประมาณ',
   'พ.ศ.',
   '2561”',
   '[END_LI

In [63]:
from datasets import Sequence, ClassLabel
ds=Dataset.from_dict(data)
ds=ds.cast_column('ner_tags', Sequence(feature=ClassLabel(names=['O','B-ENTRY','I-ENTRY'])))
print(ds)
ds.train_test_split().push_to_hub('bkk-budget-ner-page',)

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing split train to the Hub.


Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 630
})


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [38]:
list(zip(ds[100]['tokens'],ds[100]['ner_tags']))[:10]

[('[BEGIN_DOC]', 0),
 ('[BEG_LINE]', 0),
 ('185', 0),
 ('[END_LINE]', 0),
 ('[BEG_LINE]', 1),
 ('07102-1', 2),
 ('(3)', 2),
 ('ค่าใช้จ่ายในการสนับสนุนการดำเนินงานของ', 2),
 ('[END_LINE]', 2),
 ('[BEG_LINE]', 2)]

In [23]:
tokenized_input = tokenizer(data['tokens'][0], is_split_into_words=True, truncation=True)
tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])[:20]

NameError: name 'tokenizer' is not defined

# Human labeling

In [39]:
import json
import random

In [40]:
def get_data():
    data = None
    with open('../dataset/project-7-at-2022-12-28-08-41-7e221725.json') as fp:
        data=json.load(fp)
    return data

In [41]:
def split_text(text, splits, names):
    # preprocess
    indexes = [0, len(text)]
    for split in splits:
        indexes.extend(split)
    names_index = {e: name for (s,e), name in zip(splits, names)}
    
    indexes = list(set(indexes))
    indexes.sort()
    pointer = 0
    tokens = []
    token_name = []

    split_re = r'(\s|\[BEGIN_DOC\]|\[END_LINE\])'
    for end in indexes[1:]:
        sub_string = text[pointer:end]
        tag = names_index.get(end)
        for i, sub_token in enumerate(re.split(split_re, sub_string)):
            if sub_token.strip() == '': continue
            tokens.append(sub_token)
            
            if tag is not None:
                token_name.append('I-'+tag if i else 'B-'+tag)
            else:
                token_name.append('O')
        pointer = end
    assert len(tokens) == len(token_name)
    return tokens, token_name

In [42]:
def get_dataset():
    tokens_list = []
    ner_tags_list = []
    for example in get_data():
        splits = []
        ner_tags = []
        if 'label' in example.keys():
            splits = [(label['start'],label['end']) for label in example['label']]
            ner_tags = [label['labels'][0] for label in example['label']]
        tokens, token_tags = split_text(example["text"], splits, ner_tags)
        
        tokens_list.append(tokens)
        ner_tags_list.append(token_tags)
    return {'tokens': tokens_list, 'ner_tags': ner_tags_list}

In [43]:
len(data['tokens']),len(data['ner_tags'])

(630, 630)

In [44]:
idx = random.randint(0,len(LS_dataset['tokens']))
list(zip(LS_dataset['tokens'][idx], LS_dataset['ner_tags'][idx]))[:100]

NameError: name 'LS_dataset' is not defined

In [153]:
LS_dataset = get_dataset()
for key in LS_dataset:
    data[key].extend(LS_dataset[key])

In [155]:
len(data['tokens']),len(data['ner_tags'])

(552, 552)

In [158]:
hf_ds=Dataset.from_dict(data)
hf_ds=hf_ds.cast_column('ner_tags', Sequence(feature=ClassLabel(names=['O','B-ENTRY','I-ENTRY'])))

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [163]:
hf_ds.train_test_split(0.2).push_to_hub('bkk-budget-ner-page',)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/569 [00:00<?, ?B/s]

# Rule based

In [79]:
import pandas as pd

In [175]:
df = pd.read_csv('../dataset/rule_based_entry.csv', index_col=0)

In [151]:
df['ner_tag'] = 'O'
for name, entry_df in df[~df.entry_label.isna()].groupby('entry_label'):
    df.loc[entry_df.index, 'ner_tag'] = ['B-ENTRY'] + ['I-ENTRY'] * (len(entry_df) - 1)

In [152]:
page_list = list()
for name, page_df in df.groupby(['pdf', 'pagenum']):
    page_name = name[0]+'_'+str(name[1])
    page_series = page_df\
        .groupby('line_label')\
        .apply(lambda x: pd.Series({
            'tokens':x.fix_text.values.tolist() + ['[LINE_END]'],
            'ner_tags': x.ner_tag.values.tolist() + [x.ner_tag.values.tolist()[-1]],
            'page': page_name
        }))
    page_list.append(page_series)

In [174]:
i=random.randint(0,len(page_list))
print(i)
page_list[i]

440


Unnamed: 0_level_0,tokens,ner_tags,page
line_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[60, [LINE_END]]","[O, O]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
2,"[0310042-62-37, 2.20, โครงการก่อสร้างโรงพยาบาล...","[B-ENTRY, I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
3,"[วัตถุประสงค์, [LINE_END]]","[I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
4,"[-, เพื่อให้ประชาชนได้รับบริการทางการแพทย์ที่ค...","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
5,"[ทั้งทางด้านการรักษาพยาบาล, การส่งเสริมสุขภาพ,...","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
6,"[การป้องกันโรค, และการฟื้/นฟูสุขภาพที่มีประสิท...","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
7,"[-, เพื่อส่งเสริมและพัฒนาคุณภาพชีวิตของประชาชน...","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
8,"[คลองสามวาและพื้นที่ใกล้เคียง, ได้แก่, เขตหนอง...","[I-ENTRY, I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
9,"[เขตคันนายาว, เขตบางเขน, เขตมีนบุรี, เขตสายไหม...","[I-ENTRY, I-ENTRY, I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59
10,"[-, เพื่อเพิ่มโอกาสและขยายการให้บริการด้านการแ...","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/64/topic2787_2020_10_06_10_10_30.pdf_59


In [173]:
page_list[721].loc[4,'ner_tags'] = ['B-ENTRY', 'I-ENTRY', 'I-ENTRY', 'I-ENTRY']
page_list[250].loc[2:3]

Unnamed: 0_level_0,tokens,ner_tags,page
line_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,"[05313-4, (4), ปรับปรุงซอยหทัยราษฎร์, 50, [LIN...","[O, O, O, O, O]",pdf/62/A20181002170238.pdf_21
3,"[จากถนนหทัยราษฎร์ถึงสุดระยะที่กำหนดให้, 17,419...","[O, O, O, O]",pdf/62/A20181002170238.pdf_21


In [147]:
page_series.to_dict(orient='list');

In [91]:
pd.concat(page_list)

Unnamed: 0_level_0,tokens,ner_tags,page
line_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[19, [LINE_END]]","[O, O]",pdf/61/A20171003161804.pdf_19
2,"[1.5, ภาษีบำรุงกรุงเทพมหานครสำหรับน้ำมันฯ, จำน...","[O, O, O, O, O, O]",pdf/61/A20171003161804.pdf_19
3,"[ก., ประมาณการและรายรับจริง, [LINE_END]]","[O, O, O]",pdf/61/A20171003161804.pdf_19
4,"[(หน่วย, :, ล้านบาท), [LINE_END]]","[O, O, O, O]",pdf/61/A20171003161804.pdf_19
5,"[+, เปรียบเทียบกับ, [LINE_END]]","[O, O, O]",pdf/61/A20171003161804.pdf_19
...,...,...,...
28,"[320,500, บาท, [LINE_END]]","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/65/65077.pdf_16
29,"[01205-1, เงินเพิ่มค่าครองชีพชั่วคราวของพนักงา...","[B-ENTRY, I-ENTRY, I-ENTRY]",pdf/65/65077.pdf_16
30,"[27,000, บาท, [LINE_END]]","[I-ENTRY, I-ENTRY, I-ENTRY]",pdf/65/65077.pdf_16
31,"[01206-1, เงินช่วยเหลือค่าครองชีพของพนักงาน, [...","[B-ENTRY, I-ENTRY, I-ENTRY]",pdf/65/65077.pdf_16
