### Import Libraries

In [8]:
import fitz
import re
import json
import os
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [9]:
def get_text(input_path):
    """Extract the plain text of every page in a document.

    Args:
        input_path: Path to a file that ``fitz.open`` can read.

    Returns:
        The text of all pages concatenated in page order, with no extra
        separator inserted between pages.
    """
    # Open through the context manager so the document handle is released
    # even if text extraction raises part-way through.
    with fitz.open(input_path) as doc:
        return "".join(page.get_text() for page in doc)

In [10]:
def remove_space_redundant(text):
    """Collapse each run of whitespace in *text* into a single space.

    Splitting on whitespace also discards leading and trailing whitespace,
    so the result never starts or ends with a space.
    """
    return " ".join(text.split())

In [11]:
def create_content_json(input_path, output_path):
    """Split a document into articles and write them to a JSON file.

    The text returned by ``get_text(input_path)`` is first cut into chapters
    (each starting at a "Chương <roman numeral>" heading), then every chapter
    is cut into articles (each starting at "Điều <number>."). One record is
    written per article:

        {"title": "<chapter heading>\\n<chapter name>,\\n<Điều N.>",
         "content": "<article text, stripped>"}

    Args:
        input_path: Path of the source document, passed to ``get_text``.
        output_path: Path of the UTF-8 JSON file to write (overwritten).
    """
    text = get_text(input_path)

    # Single definition of the chapter-heading pattern, reused below.
    chapter_head = r"Chương \b(?:I{1,3}(?:V?X?)?|VI{0,3}|XI{0,3}V?|XVI{0,3})\b\.?"
    # A chapter runs lazily up to the next heading followed by a space, or to
    # the end of the text.
    regex_chapter_block = re.compile(
        "(" + chapter_head + ")(.*?)(?=(" + chapter_head + " |$))", re.DOTALL
    )
    # Heading plus the rest of the first non-blank line after it.
    regex_chapter = re.compile("(" + chapter_head + r")\s*(.*)")
    regex_rule = re.compile(r"(Điều \d+\.)(.*?)(?=(Điều \d+\. |$))", re.DOTALL)

    data = []
    # Extract each chapter first, then the articles inside it.
    for chapter_block in regex_chapter_block.finditer(text):
        content_chap = chapter_block.group(0)

        # Every chunk starts with its own heading, so anchor the match there.
        # Taking only this heading (not every "Chương ..." occurrence in the
        # chunk) keeps exactly one chapter title per article; in-text
        # references to other chapters used to add extra titles, which
        # misaligned titles and contents and could raise IndexError.
        heading = regex_chapter.match(content_chap)
        chapter_title = (heading.group(1) + "\n" + heading.group(2)).strip()

        for rule in regex_rule.finditer(content_chap):
            data.append({
                'title': chapter_title + ",\n" + rule.group(1).strip(),
                'content': rule.group(2).strip(),
            })

    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

### Create content JSON

In [12]:
# Convert the source PDF into a JSON file with one record per article.
input_path = "luật_đất_đai_2024.pdf"
output_path = "luật_đất_đai_2024.json"
create_content_json(input_path, output_path)

### Create chunk JSON

In [15]:
def create_chunk_json(input_path, output_path):
    """Split a document into articles, chunk each one, and write JSON.

    The text returned by ``get_text(input_path)`` is cut into chapters
    (starting at "Chương <roman numeral>") and then into articles (starting
    at "Điều <number>."). The first line of an article is treated as its
    heading; the remaining lines are whitespace-normalised and split with a
    ``RecursiveCharacterTextSplitter`` (chunk_size=512, chunk_overlap=64).
    One record is written per chunk:

        {"title": "<document name>\\n<chapter title>\\n<article heading>",
         "context": "<chunk text>"}

    Args:
        input_path: Path of the source document, passed to ``get_text``. Its
            base name, without a trailing ".pdf" and with underscores turned
            into spaces, becomes the first line of every title.
        output_path: Path of the UTF-8 JSON file to write (overwritten).
    """
    text = get_text(input_path)

    # Single definition of the chapter-heading pattern, reused below.
    chapter_head = r"Chương \b(?:I{1,3}(?:V?X?)?|VI{0,3}|XI{0,3}V?|XVI{0,3})\b\.?"
    # A chapter runs lazily up to the next heading followed by a space, or to
    # the end of the text.
    regex_chapter_block = re.compile(
        "(" + chapter_head + ")(.*?)(?=(" + chapter_head + " |$))", re.DOTALL
    )
    # Heading plus the rest of the first non-blank line after it.
    regex_chapter = re.compile("(" + chapter_head + r")\s*(.*)")
    regex_rule = re.compile(r"(Điều \d+\.)(.*?)(?=(Điều \d+\. |$))", re.DOTALL)

    # Remove only a literal ".pdf" suffix. str.rstrip(".pdf") strips any
    # trailing run of the characters '.', 'p', 'd', 'f', which mangles names
    # such as "app.pdf".
    doc_name = os.path.basename(input_path)
    if doc_name.endswith(".pdf"):
        doc_name = doc_name[:-len(".pdf")]
    doc_name = doc_name.replace('_', ' ')

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=64)

    data = []
    # Extract each chapter first, then the articles inside it.
    for chapter_block in regex_chapter_block.finditer(text):
        content_chap = chapter_block.group(0)

        # Every chunk starts with its own heading, so anchor the match there.
        # Taking only this heading (not every "Chương ..." occurrence in the
        # chunk) keeps exactly one chapter title per article; in-text
        # references to other chapters used to add extra titles, which
        # misaligned titles and contents and could raise IndexError.
        heading = regex_chapter.match(content_chap)
        chapter_title = (heading.group(1) + "\n" + heading.group(2)).strip()

        for rule in regex_rule.finditer(content_chap):
            # First line after "Điều N." is the article heading; everything
            # after it is the article body.
            first_line, _, remainder = rule.group(2).partition('\n')
            rule_title = (rule.group(1) + first_line.strip()).strip()
            title = doc_name + "\n" + chapter_title + "\n" + rule_title

            # Whitespace normalisation also folds the remaining newlines
            # into single spaces.
            content = remove_space_redundant(remainder)
            for chunk in text_splitter.split_text(content):
                data.append({'title': title, 'context': chunk})

    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

In [16]:
# Convert the source PDF into a JSON file with one record per text chunk.
# NOTE(review): output_path is the same file name that the earlier
# create_content_json call writes, so running this replaces that file's
# contents with the chunked records — confirm the overwrite is intended.
input_path = "luật_đất_đai_2024.pdf"
output_path = "luật_đất_đai_2024.json"
create_chunk_json(input_path, output_path)