In [1]:
import os
from os.path import join, dirname, abspath
import pandas as pd
import re
import requests
import fitz
import time
from typing import List
from bs4 import BeautifulSoup
from openai import AzureOpenAI
from markdownify import markdownify as md
from langchain.docstore.document import Document
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import AzureOpenAIEmbeddings

from langchain_community.vectorstores.azuresearch import (
    AzureSearch,
    FIELDS_ID,
    FIELDS_CONTENT,
    FIELDS_CONTENT_VECTOR,
    FIELDS_METADATA,
)
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
)
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("OPENAI_ENDPOINT"),
    azure_deployment=os.getenv("EMBEDDING_NAME"),
    chunk_size=20480,
    openai_api_key=os.getenv("OPENAI_KEY"),
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    openai_api_type="azure",
)

In [3]:
def extract_paragraphs(page: fitz.Page, tables: List[fitz.Rect]):
    page_words = page.get_text("words")
    paragraph_words = [
        w
        for w in page_words
        if not any([fitz.Rect(w[:4]).intersects(r) for r in tables])
    ]

    line_dict = {}
    paragraph_words.sort(key=lambda w: w[0])
    for w in paragraph_words:
        y1 = round(w[3], 1)
        word = w[4]
        line = line_dict.get(y1, [])
        line.append(word)
        line_dict[y1] = line
    lines = [line_dict[y1] for y1 in sorted(line_dict.keys())]
    return "\n".join([" ".join(line) for line in lines])


HTML_CLEANUP_RULES = [
    {
        "target": """Afbeelding met tekst

Automatisch gegenereerde beschrijving""",
        "replacement": "GENERATED_IMAGE_DESCRIPTION",
    },
    {
        "target": """Afbeelding met tabel

Automatisch gegenereerde beschrijving""",
        "replacement": "GENERATED_IMAGE_TABLE_DESCRIPTION",
    },
    {
        "target": """
 |""",
        "replacement": " |",
    },
]

MARKDOWN_CLEANUP_RULES = [
    {
        "target": """
 |""",
        "replacement": " |",
    },
    {
        "target": "\u200b",
        "replacement": "",
    },
    {
        "target": "\xa0",
        "replacement": " ",
    },
]


def pdf_to_html(html_string: str):
    """
    Extracts all paragraphs from a PDF page

    Inputs:
    page    A PyMuPDF page
    tables  A list of the bounding boxes of all tables on the page

    Returns:
    A string with all paragraphs of text on the page, separated by double newlines
    """
    soup = BeautifulSoup(html_string, "html.parser")
    # Filter and format the HTML lines
    filtered_lines = []
    for p_tag in sorted(
        soup.find_all("p"),
        key=lambda x: float(x["style"].split(";")[0].split(":")[1].replace("pt", "")),
    ):
        top_position = float(
            p_tag["style"].split(";")[0].split(":")[1].replace("pt", "")
        )
        left_position = float(
            p_tag["style"].split(";")[1].split(":")[1].replace("pt", "")
        )
        font_size = float(
            p_tag.find("span")["style"].split(";")[1].split(":")[1].replace("pt", "")
        )

        filtered_lines.append(
            {
                "top": top_position,
                "font_size": font_size,
                "content": str(p_tag),
            }
        )

    # format content from HTML as string
    formatted_html = ""
    for _, line in enumerate(filtered_lines):
        formatted_html += line["content"]

    return formatted_html


def sanitize_content(content: str, rules: dict) -> str:
    """
    Replaces content in the `content` string, using the rules specified.

    Inputs:
    content: the text to replace in
    rules: a dictionary of the rules to apply

    Outputs:
    The input string `content`, with all the rules specified in `rules` applied.
    """
    for target, replacement in rules:
        content = content.replace(target, replacement)
    return content


def convert_to_markdown(content: str) -> str:
    """
    Converts a HTML string to equivalent Markdown.

    Inputs:
    content     A text string with HTML

    Returns:
    A string containing the Markdown equivalent of the HTML string.
    """
    markdown = md(
        content,
        newline_style="<br>",
        heading_style="ATX",
        wraps=False,
        wrap_width=1000,
    )
    # Replace instancs of <text> with <<text>>, but not other combinations of angle brackets and/or text
    markdown = re.sub(r"((?<!<)<(?!<)(\w+)(?<!>)>(?!>))", r"\2", markdown)
    return markdown


def clean_markdown(markdown):
    """
    Preprocesses the Markdown text by performing several operations:

    1. Normalize text by lowercasing
    2. Replacing multiple newlines with a single newline character.
    3. Reducing multiple consecutive spaces to a single space.
    4. Replacing long URLs with hashes.
    5. Removing unnecessary symbols from the Markdown text.
    6. Removing stopwords from text.

    Args:
        markdown (str): The input Markdown text to be preprocessed.

    Returns:
        str: The preprocessed Markdown text.
    """
    markdown = markdown.lower()
    markdown = re.sub(r"\n\s*\n", " ", markdown)
    markdown = re.sub(r"[ ]+", " ", markdown)
    markdown = re.sub(r"([^\w\s\[\]\(\$\\.\n\/:#<>{},_\"!@\-*=])\1+", "", markdown)
    return markdown


def pdf_reading(data_folder, file):
    """
    Read in PDF file, convert to markdown via HTML conversion
    """
    pdf_file = fitz.open(f"{data_folder}/{file}")
    markdown_dict = {}
    for idx, page in enumerate(pdf_file):
        # convert pdf text --> html --> markdown to keep formatting
        # use table position to prevent table text to be added in this step
        page_html = pdf_to_html(page.get_text("html"))
        html_sanitized = sanitize_content(page_html, HTML_CLEANUP_RULES)
        markdown = convert_to_markdown(html_sanitized)
        markdown = sanitize_content(markdown, MARKDOWN_CLEANUP_RULES)
        markdown = clean_markdown(markdown)
        if len(markdown) > 10:
            markdown_dict[str(idx)] = markdown

    return markdown_dict

In [4]:
def delete_index():
    headers = {
        "api-key": os.getenv("SEARCH_KEY"),
        "Content-Type": "application/json",
    }
    response = requests.delete(
        os.getenv("SEARCH_ENDPOINT")
        + "/indexes/"
        + os.getenv("SEARCH_INDEX_NAME")
        + "?api-version=2020-06-30",
        headers=headers,
    )
    return response.status_code == 204


def check_index_deleted():
    headers = {
        "api-key": os.getenv("SEARCH_KEY"),
    }
    response = requests.get(
        os.getenv("SEARCH_ENDPOINT")
        + "/indexes/"
        + os.getenv("SEARCH_INDEX_NAME")
        + "?api-version=2020-06-30",
        headers=headers,
    )
    return response.status_code == 404

In [5]:
def init_acs():
    acs = AzureSearch(
        azure_search_endpoint=os.getenv("SEARCH_ENDPOINT"),
        azure_search_key=os.getenv("SEARCH_KEY"),
        index_name=os.getenv("SEARCH_INDEX_NAME"),
        embedding_function=embeddings.embed_query,
        fields=[
            SimpleField(
                name=FIELDS_ID,
                type=SearchFieldDataType.String,
                key=True,
                filterable=True,
            ),
            SearchableField(
                name=FIELDS_CONTENT,
                type=SearchFieldDataType.String,
                analyzer_name="nl.lucene",
            ),
            SearchField(
                name=FIELDS_CONTENT_VECTOR,
                type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True,
                vector_search_dimensions=1536,
                vector_search_profile_name="myHnswProfile",
            ),
            SearchableField(
                name=FIELDS_METADATA,
                type=SearchFieldDataType.String,
                analyzer_name="nl.lucene",
            ),
            SimpleField(
                name="title",
                type=SearchFieldDataType.String,
                key=False,
                filterable=True,
                sortable=True,
                facetable=True,
            ),
            SimpleField(
                name="page",
                type=SearchFieldDataType.String,
                key=False,
                filterable=True,
                sortable=True,
                facetable=True,
            ),
        ],
    )
    return acs

In [6]:
import pandas as pd

df

NameError: name 'df' is not defined

In [7]:
# AI SEARCH DATABASE ON FULL TEXT ECLI RECHTSPRAAK
acs = init_acs()

params = {
    "splitter": "TokenTextSplitter",
    "encoding_name": "cl100k_base",
    "chunk_size": 20480,
    "chunk_overlap": 2048,
}
token_splitter = TokenTextSplitter(
    encoding_name=params["encoding_name"],
    chunk_size=params["chunk_size"],
    chunk_overlap=params["chunk_overlap"],
)

for df_name in [
    "rechtspraak_metadata_2020.xlsx",
    "rechtspraak_metadata_2021.xlsx",
    "rechtspraak_metadata_2022.xlsx",
    "rechtspraak_metadata_2023.xlsx",
    "rechtspraak_metadata_2024.xlsx",
]:
    print(df_name)
    df = pd.read_excel(df_name)
    documents = []
    for index, row in df.iterrows():
        ecli = row["ecli"]
        fulltext = row["full_text"]

        chunks = token_splitter.split_text(fulltext)
        print(f"{ecli}: {len(chunks)}")
        for idx, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk,
                metadata={"title": ecli, "chunk_index": idx},
            )
            documents.append(doc.copy())

    # ADD ALL CHUNKS TO ACS
    acs.add_documents(documents)
    print("Creation of new index succesfull.")

rechtspraak_metadata_2020.xlsx
ECLI:NL:RVS:2020:5: 7
ECLI:NL:GHARL:2020:214: 5
ECLI:NL:GHARL:2020:216: 5
ECLI:NL:GHARL:2020:220: 4
ECLI:NL:GHARL:2020:219: 4
ECLI:NL:HR:2020:54: 1
ECLI:NL:RBLIM:2020:401: 5
ECLI:NL:RVS:2020:252: 2
ECLI:NL:CRVB:2020:241: 3
ECLI:NL:RBNNE:2020:430: 3
ECLI:NL:RBOVE:2020:376: 3
ECLI:NL:RBROT:2020:904: 8
ECLI:NL:RVS:2020:383: 2
ECLI:NL:GHARL:2020:1106: 4
ECLI:NL:HR:2020:250: 1
ECLI:NL:RBNNE:2020:724: 2
ECLI:NL:RBOVE:2020:837: 10
ECLI:NL:RVS:2020:593: 3
ECLI:NL:CBB:2020:126: 6
ECLI:NL:CRVB:2020:537: 3
ECLI:NL:GHSHE:2020:888: 11
ECLI:NL:HR:2020:424: 1
ECLI:NL:CBB:2020:176: 3
ECLI:NL:GHSHE:2020:974: 2
ECLI:NL:OGEAC:2020:57: 3
ECLI:NL:CBB:2020:163: 11
ECLI:NL:CRVB:2020:678: 2
ECLI:NL:CRVB:2020:680: 4
ECLI:NL:CRVB:2020:744: 2
ECLI:NL:GHSHE:2020:1079: 3
ECLI:NL:CRVB:2020:763: 5
ECLI:NL:CBB:2020:210: 2
ECLI:NL:RBNNE:2020:1441: 2
ECLI:NL:RBNHO:2020:2533: 6
ECLI:NL:RBROT:2020:2985: 4
ECLI:NL:OGEAA:2020:193: 6
ECLI:NL:HR:2020:588: 1
ECLI:NL:RBNHO:2020:2651: 2
ECLI:NL:HR

In [47]:
# # first delete current index
# # if delete_index():
# #     for _ in range(12):
# #         time.sleep(5)
# #         if check_index_deleted():
# print("Index has been deleted. Continue...")

# # init new index
# acs = init_acs()

# # init chunk parameters
# documents = []
# params = {
#     "splitter": "TokenTextSplitter",
#     "encoding_name": "cl100k_base",
#     "chunk_size": 1536,
#     "chunk_overlap": 32,
# }
# token_splitter = TokenTextSplitter(
#     encoding_name=params["encoding_name"],
#     chunk_size=params["chunk_size"],
#     chunk_overlap=params["chunk_overlap"],
# )

# ##PDF##
# document_path = "documents/"
# pdfs = {}
# for pdf_file in os.scandir(document_path):
#     if pdf_file.name.endswith(".pdf"):
#         filename = pdf_file.name
#         content_dict = pdf_reading(document_path, filename)
#         for page, content in content_dict.items():
#             chunks = token_splitter.split_text(content)
#             print(f"{filename}|{page}|{len(chunks)}")
#             for idx, chunk in enumerate(chunks):
#                 doc = Document(
#                     page_content=chunk,
#                     metadata={
#                         "title": filename,
#                         "page": str(page),
#                     },
#                 )
#                 documents.append(doc.copy())

# # ADD ALL CHUNKS TO ACS
# acs.add_documents(documents)
# print("Creation of new index succesfull.")

Index has been deleted. Continue...
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|0|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|2|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|3|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|4|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|5|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|6|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|7|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|8|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|9|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|10|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|11|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|12|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|13|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|14|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|15|1
01 De Cock en een strop voor Bobby - A.C. Baantjer.pdf|16|1
01 De Cock en