In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pprint import pprint
from dotenv import load_dotenv
import pathlib
# from tqdm import tqdm
from tqdm.autonotebook import tqdm
from pathlib import Path
from pprint import pprint
import numpy as np
import hashlib
import re

import polars as pl
from glob import glob

from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar

from openai import OpenAI
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings
import tiktoken

from pinecone_text.sparse import BM25Encoder


# Project root is taken to be two directory levels above the current working
# directory (i.e. the notebook is expected to run from <root>/<dir>/<dir>).
root_dir = Path(os.getcwd()).parent.parent
# Put the project root first on sys.path so the `src` package below is importable.
# Must stay before the `src...` import.
sys.path.insert(0, str(root_dir))


# NOTE(review): wildcard import — later cells call helpers that are not imported
# explicitly anywhere visible (e.g. obtener_articulos_diccionario, chunks_text,
# save_json); presumably they come from this module. Consider importing them by name.
from src.d01_data.data_izeta import *


# Polars display options: longer string cells, more rows and more columns when
# a DataFrame is rendered in the notebook.
pl.Config.set_fmt_str_lengths(300)
pl.Config.set_tbl_rows(100)
# The trailing semicolon keeps the call's return value from being echoed as cell output.
pl.Config.set_tbl_cols(20);

  from tqdm.autonotebook import tqdm


In [3]:
from src.d01_data.data import (read_json, create_index_if_not_exists, upsert_vectors_in_batches, json_dump)

from src.d00_utils.utils import (get_tokens_len, generate_sparse_vector_in_batches,
                                 dict_to_document_boe, metadata_to_uuid)

# Load environment variables from the .env file two directories above the
# current working directory (relative path, so it depends on where the kernel runs).
load_dotenv('../../.env')

# Data folders, all located under <root_dir>/data.
raw_path = root_dir / 'data' / '01_raw'                    # source files (e.g. the PDFs to parse)
intermediate_path = root_dir / 'data' / '02_intermediate'  # parsed / chunked artefacts
output_path = root_dir / 'data' / '04_model_output'        # model outputs

In [4]:
# Credentials and model identifiers are read from the process environment;
# any variable that is not set comes back as None.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
EMBEDDING_MODEL = os.environ.get('EMBEDDING_MODEL')
MODEL = os.environ.get('OPENAI_MODEL')

# Report which chat model is configured (the API keys are deliberately not printed).
print('Model used: {}'.format(MODEL))

Model used: gpt-4o-mini-2024-07-18


In [9]:
def estructurar_autorizaciones(pdf_path, pdf_name):
    """
    Organize the text of an authorization PDF into a dictionary keyed by section.

    Text boxes are classified using their text, their left x-coordinate and the
    font sizes of their characters:

    * boxes whose largest font is below 8 are dropped (page numbers);
    * everything before the first box with a font larger than 17 is treated as
      the table of contents and dropped; that large-font box is the document
      title (if several exist, the last one wins);
    * boxes starting left of x=52 or right of x=550 are dropped;
    * a box that starts with one of the known section headings and sits left of
      x=63 opens a new section; any other non-empty box is appended to the
      current section's content (text seen before the first heading goes under
      'Sin Seccion').

    Parameters:
    - pdf_path (str | Path): Path to the PDF file, passed to pdfminer's extract_pages.
    - pdf_name (str): Name of the PDF. Used as the top-level key when no title
      (font size > 17) is found in the document.

    Returns:
    - dict: ``{title: {section: {'content': [text, ...]}}}``. The inner dict is
      empty when no title is found, because everything is then considered part
      of the table of contents.
    """

    # Headings that open a main section in the Izeta PDFs (matched at the start of the text).
    section_pattern = re.compile(
        r'Tipo de autorización|Normativa básica|Requisitos|Documentación exigible|Procedimiento'
    )

    # Layout thresholds, in the units reported by pdfminer.
    min_font_size = 8               # anything smaller is skipped (page numbers)
    title_font_size = 17            # anything larger is the document title
    min_x0, max_x0 = 52, 550        # boxes starting outside this band are ignored
    section_max_x0 = 63             # section headings start left of this x

    estructura = {}
    current_section = 'Sin Seccion'
    # Fall back to the file name so a PDF without a large-font title does not
    # leave final_name undefined (previously an UnboundLocalError).
    final_name = pdf_name
    indice_flag = True  # True while we are still inside the table of contents

    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue

            text = element.get_text().strip()
            x0 = element.bbox[0]

            # Collect the font size of each character of the element
            font_sizes = {
                character.size
                for text_line in element
                if isinstance(text_line, LTTextContainer)
                for character in text_line
                if isinstance(character, LTChar)
            }

            # Skip elements without characters
            if not font_sizes:
                continue
            largest_font = max(font_sizes)

            # Skip page numbers
            if largest_font < min_font_size:
                continue

            # The title marks the end of the table of contents
            if largest_font > title_font_size:
                indice_flag = False
                final_name = text.replace('\n', ' ')

            # Skip lines that belong to the table of contents
            if indice_flag:
                continue

            # Skip text outside the main text band
            if x0 < min_x0 or x0 > max_x0:
                continue

            if section_pattern.match(text) and x0 < section_max_x0:
                # Main section heading: following content is filed under it
                current_section = text
            elif text:
                # Regular content: append to the current section
                estructura.setdefault(current_section, {}) \
                          .setdefault('content', []).append(text)

    return {final_name: estructura}

In [10]:
# Parse every PDF under <raw_path>/orientacion, split its sections into chunks
# of at most MAX_CHUNK_TOKENS tokens and store the chunks as one JSON file per
# PDF under <intermediate_path>/orientacion.
MAX_CHUNK_TOKENS = 1000

orientacion_raw_dir = raw_path / 'orientacion'

# Output folder mirrors the raw one; make sure it exists before writing.
path_start = intermediate_path / 'orientacion'
path_start.mkdir(parents=True, exist_ok=True)

# The tokenizer only depends on the model, so build it once instead of once per section.
enc = tiktoken.encoding_for_model(MODEL)

# Sorted for a deterministic processing order.
for pdf_file in sorted(os.listdir(orientacion_raw_dir)):
    # Create PDF path
    path = orientacion_raw_dir / pdf_file

    # Ignore anything that is not a PDF file (hidden files, sub-folders, ...)
    if not path.is_file() or path.suffix.lower() != '.pdf':
        continue

    print(f'Parseando el PDF: {pdf_file}')

    # Build the section structure of the authorization PDF
    pdf_estructurado = estructurar_autorizaciones(path, path.name)

    # Split so that every chunk has at most MAX_CHUNK_TOKENS tokens
    chunks = []

    # NOTE(review): obtener_articulos_diccionario is defined elsewhere; this code
    # assumes it returns a mapping whose values hold the section text at index 0.
    articulos = obtener_articulos_diccionario(documento=pdf_estructurado, prefijo='', articulos=None, temario=True)
    for k, v in articulos.items():
        text_tokens = len(enc.encode(v[0]))

        if text_tokens < MAX_CHUNK_TOKENS:
            chunks.append(f'{k}: \n {v[0]}')
        else:
            # Section too long: split it and number the parts starting at 1
            text_chunks = chunks_text(text=v[0], max_tokens=MAX_CHUNK_TOKENS, model=MODEL)
            for chunk_idx, chunk in enumerate(text_chunks):
                chunks.append(f'{k} Parte {chunk_idx+1}: \n {chunk}')

    # One entry per chunk, keyed by its position
    pdf_estructurado_split = {
        i: {'chunk_content': chunk_content}
        for i, chunk_content in enumerate(chunks)
    }

    # Same file name as the PDF, with a .json extension
    json_name = path.with_suffix('.json').name
    pdf_estructurado_path = path_start / json_name
    save_json(pdf_estructurado_path, pdf_estructurado_split)


Parseando el PDF: HI 108.pdf
Parseando el PDF: HI 35.pdf
Parseando el PDF: HI 36.pdf
Parseando el PDF: HI 37.pdf
Parseando el PDF: HI 38.pdf
Parseando el PDF: HI 39.pdf
Parseando el PDF: HI 40.pdf
Parseando el PDF: HI 41.pdf
Parseando el PDF: HI 42.pdf
Parseando el PDF: HI 43.pdf
Parseando el PDF: HI 44.pdf
Parseando el PDF: HI 45.pdf
