In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pprint import pprint
from dotenv import load_dotenv
import pathlib
# from tqdm import tqdm
from tqdm.autonotebook import tqdm
from pathlib import Path
from pprint import pprint
import hashlib

import polars as pl
from glob import glob

from openai import OpenAI
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings

from google import genai
from google.genai import types
from langchain_openai import ChatOpenAI

import PIL.Image
import pymupdf


# The project root sits two levels above the notebook's working directory.
# It goes first on sys.path so the `src.*` imports in the next cell resolve.
root_dir = Path.cwd().parent.parent
sys.path.insert(0, str(root_dir))

# Polars display limits: longer strings, more rows and more columns when
# printing frames. The trailing `;` hides the repr of the last call.
pl.Config.set_fmt_str_lengths(300)
pl.Config.set_tbl_rows(100)
pl.Config.set_tbl_cols(20);

  from tqdm.autonotebook import tqdm


In [2]:
from src.d01_data.data import (read_json, json_dump, pdf_to_images, save_text_to_file, get_orientacion_docs)
from src.d03_modeling.pydantic_schemas import DocumentoInfo, Instrucciones, Formulario, SiNo
from src.d03_modeling.prompts import EXTRACT_FORM_TYPE, EXTRACT_FORM_INSTRUCTIONS, EXTRACT_FORM_MAIN_SECTIONS, is_doc_present_in_text_message
from src.d00_utils.utils import split_pdf_into_pages, remove_duplicates, campos_to_srt
from src.d03_modeling.modeling import get_imputation_campos, get_imputation_seleccion, create_resumen


# Read environment variables from the .env file two directories up
# (relative to the current working directory).
load_dotenv('../../.env')

# One folder per stage of the data pipeline, all under <root>/data.
raw_path = root_dir.joinpath('data', '01_raw')
intermediate_path = root_dir.joinpath('data', '02_intermediate')
clean_path = root_dir.joinpath('data', '03_clean')
output_path = root_dir.joinpath('data', '04_model_output')

In [3]:
# Provider settings come from the environment (populated by load_dotenv).
# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_MODEL = os.environ.get('OPENAI_MODEL')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
GEMINI_MODEL = os.environ.get('GEMINI_MODEL')

# Gemini client reused by every generate_content call below.
client = genai.Client(api_key=GEMINI_API_KEY)

In [4]:
# Collect the raw form PDFs in a stable (alphabetical) order and hand each one
# to split_pdf_into_pages, which reports the per-page files it saves.
formulario_files = glob(str(raw_path / 'formularios' / '*.pdf'))
formulario_files.sort()
for formulario_file in formulario_files:
    split_pdf_into_pages(
        formulario_file,
        output_path=intermediate_path / 'formularios' / 'español',
    )

File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-10\EX-10_0.pdf
File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-10\EX-10_1.pdf
File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-10\EX-10_2.pdf
File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-17\EX-17_0.pdf
File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-17\EX-17_1.pdf
File saved in c:\Users\edgarmp\Desktop\Proyectos\ODIASEIA hackaton\data\02_intermediate\formularios\español\EX-17\EX-17_2.pdf


In [5]:
# One folder per form, each holding that form's single-page PDFs.
# pathlib drops the trailing slash of '*/', so the pattern handed to glob is
# really '.../español/*' and would also match stray files. Filter to
# directories explicitly so only form folders are processed.
formularios = sorted(
    entry
    for entry in glob(str(intermediate_path / 'formularios' / 'español' / '*'))
    if Path(entry).is_dir()
)

# Map each orientation JSON (keyed by file stem) to whatever
# get_orientacion_docs returns for it.
orientacion_files = glob(str(intermediate_path / 'orientacion' / '*.json'))
processes = {
    Path(orientacion_file).stem: get_orientacion_docs(orientacion_file)
    for orientacion_file in orientacion_files
}

# Keep only the processes whose first element is non-empty.
# NOTE(review): the name says "with_text" but the filter tests v[0], while the
# loop below sends v[1] to the LLM and lists v[0] -- confirm which index holds
# the text.
processes_with_text = {d: v for d, v in processes.items() if v[0]}

In [None]:
def _require_parsed(response, what, formulario):
    """Return ``response.parsed`` or raise a descriptive error when it is None.

    The structured-output responses used below can come back with
    ``parsed`` set to None (the sections call is already guarded for that),
    which would otherwise surface as an opaque AttributeError.
    """
    parsed = response.parsed
    if parsed is None:
        raise ValueError(f'No structured output returned for {what} of {formulario}')
    return parsed


# The yes/no classifier does not depend on the form being processed, so it is
# built once instead of on every iteration.
llm = ChatOpenAI(
    model=OPENAI_MODEL,
    temperature=0.4,
    max_tokens=10,
    top_p=0.6,
    timeout=None,
    max_retries=2
).with_structured_output(SiNo, include_raw=False, method ='json_schema', strict=True)

# For every form folder: extract code/laws (first page), filling instructions
# (last page), sections and fields (all pages but the last), find the
# processes that mention the form, summarise everything and save it as text.
for formulario in tqdm(formularios):
    pdfs = sorted(glob(str(Path(formulario) / '*.pdf')))
    pdfs = [Path(pdf).read_bytes() for pdf in pdfs]

    if not pdfs:
        # Nothing to send to the model; pdfs[0] below would raise IndexError.
        print(f'Skipping {formulario}: no PDF pages found')
        continue

    # A fresh config per form; its schema and token budget are adjusted
    # between the three extraction stages below.
    config = types.GenerateContentConfig(
                system_instruction='Eres un analista de documentos sobre la inmigración española',
                max_output_tokens=100,
                top_p= 0.6,
                temperature= 0.5,
                response_mime_type= 'application/json',
                response_schema=DocumentoInfo,
            )

    # --- Stage 1: document code and supporting laws, read from the first page.
    doc_info = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=[
                    types.Part.from_bytes(
                        data=pdfs[0],
                        mime_type='application/pdf'),
                    EXTRACT_FORM_TYPE],
                config=config)
    doc_info = _require_parsed(doc_info, 'document info', formulario)

    doc_code = doc_info.codigo_documento
    doc_laws = '\n'.join([ley.ley for ley in doc_info.leyes])
    doc_laws = f'Leyes en las que sustentan:\n{doc_laws}'

    # --- Stage 2: filling instructions, read from the last page.
    config.response_schema = Instrucciones
    config.max_output_tokens = 800

    doc_instructions = client.models.generate_content(
                model=GEMINI_MODEL,
                contents=[
                    types.Part.from_bytes(
                        data=pdfs[-1],
                        mime_type='application/pdf'),
                    EXTRACT_FORM_INSTRUCTIONS],
                config=config)
    doc_instructions = _require_parsed(doc_instructions, 'instructions', formulario)

    doc_instructions = '\n'.join([f'{ins.numero}) {ins.instruccion}' for ins in doc_instructions.Instrucciones])
    doc_instructions = f'Instrucciones de como rellenar el documento:\n{doc_instructions}'

    # --- Stage 3: sections and fields for every page except the last one.
    config.response_schema = Formulario
    config.max_output_tokens = 1_200

    secciones_str = 'Secciones a rellenar del formulario:'

    all_campos_a_imputar = {}
    all_campos_a_seleccionar = {}

    for i, pdf in enumerate(pdfs[:-1]):

        doc_sections = client.models.generate_content(
            model=GEMINI_MODEL,
                contents=[
                    types.Part.from_bytes(
                        data=pdf,
                        mime_type='application/pdf'),
                    EXTRACT_FORM_MAIN_SECTIONS],
                config=config)

        # Pages without parsable sections simply contribute nothing.
        if doc_sections.parsed is not None:
            page_sections = '\n\n'.join([f'Sección {sec.numero}) {sec.nombre}.\nExplicacion: {sec.informacion}' for sec in doc_sections.parsed.secciones])
            secciones_str = f'{secciones_str}\n\n{page_sections}'

        campos_a_imputar = get_imputation_campos(client=client, model=GEMINI_MODEL, pdf=pdf)
        campos_a_imputar = _require_parsed(campos_a_imputar, f'fill-in fields (page {i})', formulario)
        campos_a_imputar = remove_duplicates(campos_a_imputar.campos)

        campos_a_seleccionar = get_imputation_seleccion(client=client, model=GEMINI_MODEL, pdf=pdf)
        campos_a_seleccionar = _require_parsed(campos_a_seleccionar, f'selection fields (page {i})', formulario)
        campos_a_seleccionar = remove_duplicates(campos_a_seleccionar.campos)

        # Keyed by page index so campos_to_srt receives the fields per page.
        all_campos_a_imputar[i] = campos_a_imputar
        all_campos_a_seleccionar[i] = campos_a_seleccionar

    campos_a_imputar = campos_to_srt(campos_obj=all_campos_a_imputar, imputados=True)
    campos_a_seleccionar = campos_to_srt(campos_obj=all_campos_a_seleccionar, imputados=False)

    campos_str = f'Campos a rellenar:\n\n{campos_a_imputar}\n{campos_a_seleccionar}'

    # --- Stage 4: ask, for each process, whether this form appears in it.
    # One message per process, in the same order as processes_with_text.
    messages = is_doc_present_in_text_message([(doc_code, text[1]) for text in processes_with_text.values()])
    question_results = [res.resultado == 'si' for res in llm.batch(messages)]

    # Answers are positional, so pair them back with the processes in order.
    final_processes = '\n'.join(
        process[0].strip()
        for process, isin in zip(processes_with_text.values(), question_results)
        if isin
    )
    final_processes = f'Procesos donde este formulario es necesario:\n{final_processes}'

    # --- Stage 5: summarise and persist.
    final_doc_text = '\n\n'.join([f'Código del documento: {doc_code}', doc_laws, final_processes, doc_instructions, secciones_str])
    resumen = create_resumen(client=client, model=GEMINI_MODEL, resumen=final_doc_text)
    resumen = _require_parsed(resumen, 'summary', formulario).resumen

    final_doc_text = f'Resumen del documento:\n{resumen}\n\n{final_doc_text}\n\n{campos_str}'
    # Use a dedicated name: rebinding `output_path` here would clobber the
    # notebook-level model-output directory defined in the setup cell.
    formulario_output_path = str((clean_path / 'formularios' / Path(formulario).stem)) + '.txt'
    save_text_to_file(final_doc_text, formulario_output_path)
    

  0%|          | 0/2 [01:21<?, ?it/s]
