In [8]:
%load_ext autoreload
%autoreload 2

import asyncio
import hashlib
import html
import os
import pathlib
import random
import re
import sys
from glob import glob
from pathlib import Path
from pprint import pprint

import fitz
import nest_asyncio
import numpy as np
import polars as pl
import pymupdf
import tiktoken
from deep_translator import GoogleTranslator
from dotenv import load_dotenv
from google import genai
from googletrans import Translator
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder
# from tqdm import tqdm
from tqdm.autonotebook import tqdm


# Project root: two directory levels above the current working directory.
root_dir = Path.cwd().parent.parent
# Put the project root first on the import path so the `src` package imports below resolve.
sys.path.insert(0, str(root_dir))


from src.d01_data.data_izeta import *


# Raise polars' display limits for notebook output: string length, table rows, table columns.
pl.Config.set_fmt_str_lengths(300)
pl.Config.set_tbl_rows(100)
# The trailing semicolon suppresses the cell's echoed return value.
pl.Config.set_tbl_cols(20);

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from src.d01_data.data import *
from src.d00_utils.utils import *
from src.d03_modeling.modeling import *

# Load environment variables from the .env file two levels above the working directory.
load_dotenv('../../.env')

# Data folders under <root_dir>/data: raw inputs, intermediate files, model output.
raw_path = root_dir.joinpath('data', '01_raw')
intermediate_path = root_dir.joinpath('data', '02_intermediate')
output_path = root_dir.joinpath('data', '04_model_output')

In [10]:
# API keys and model names are read from the environment; each is None when unset.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_MODEL = os.environ.get('OPENAI_MODEL')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
GEMINI_MODEL = os.environ.get('GEMINI_MODEL')

# Gemini client used by the LLM-based translation cells further down.
client = genai.Client(api_key=GEMINI_API_KEY)

In [45]:
# Open the original PDF
doc_path = raw_path / 'formularios' / 'EX-10.pdf' # alternatives: 'EX-10.pdf', 'EX-17.pdf'
doc = fitz.open(doc_path)

# Destination for the translated copy (written next to the source form).
doc_output_path = raw_path / 'formularios' / 'EX_10_translated.pdf' # alternatives: 'EX_10_translated.pdf', 'EX_17_translated.pdf'

## Traducción de PDFs

### PyMuPDF Youtube

In [6]:
# Language name -> language code, in the same order as originally declared.
langs_dict = dict(
    [
        ('arabic', 'ar'),
        ('chinese (simplified)', 'zh-CN'),
        ('chinese (traditional)', 'zh-TW'),
        ('english', 'en'),
        ('french', 'fr'),
        ('spanish', 'es'),
        ('ukrainian', 'uk'),
    ]
)

In [7]:
# Spanish -> English translator from deep_translator, used by the block-level cell below.
es_translator = GoogleTranslator(source='es', target='en')

In [51]:
# Block-level translation: paint each text block white and write the English
# text on top of it. The original text is only covered, not removed.
doc = fitz.open(doc_path)

for page in doc:
    blocks = page.get_text('blocks', flags=pymupdf.TEXT_DEHYPHENATE)

    for block in blocks:
        # Each block is a tuple: the first four items are the bbox, item 4 is the text.
        bbox = block[:4]
        original = block[4]

        # Whitespace-only blocks carry nothing to translate; skipping them avoids
        # a useless translator call and a white patch over empty space.
        if not original.strip():
            continue

        translated = es_translator.translate(original)
        # Fall back to the source text when the translator returns no string
        # (same guard as the span-level `traducir_pdf` in this notebook).
        if not isinstance(translated, str):
            translated = original

        page.draw_rect(bbox, color=None, fill=pymupdf.pdfcolor['white'])

        # insert_htmlbox parses its input as HTML, so escape '<', '>' and '&'
        # to keep the translated text literal.
        page.insert_htmlbox(bbox, html.escape(translated, quote=False))

#doc.save(doc_output_path)

[(161.16000366210938, 90.34320068359375, 486.6900634765625, 101.51457977294922, ' \n \n', 0, 0), (33.959999084472656, 99.84423828125, 36.45009231567383, 110.87458038330078, ' \n', 1, 0), (224.04000854492188, 52.24166488647461, 337.5514831542969, 63.670806884765625, 'Solicitud de autorización de  \n', 2, 0), (210.83999633789062, 63.64166259765625, 348.57757568359375, 75.07080078125, 'residencia/residencia y trabajo por \n', 3, 0), (220.67999267578125, 75.16165924072266, 338.7381896972656, 86.5907974243164, 'circunstancias excepcionales \n', 4, 0), (227.75999450683594, 86.56166076660156, 331.6712646484375, 97.99079895019531, '(LO 4/2000 y RD 557/2011) \n', 5, 0), (258.9599914550781, 30.88166046142578, 284.001220703125, 42.3108024597168, 'EX-10 \n', 6, 0), (33.959999084472656, 110.89146423339844, 34.425193786621094, 113.23237609863281, ' \n', 7, 0), (286.20001220703125, 113.1421127319336, 286.71002197265625, 115.4013442993164, ' \n', 8, 0), (33.959999084472656, 115.49308013916016, 36.6964

### Función Buena

In [66]:
# nest_asyncio patches asyncio to allow nested event loops, so the asyncio.run()
# call in the next cell can be used from inside the notebook.
nest_asyncio.apply()

async def traducir_pdf_async(translator, doc, doc_output_path):
    """Translate every text span of ``doc`` to English and save the result.

    For each page, every non-blank span is translated, its bounding box is
    redacted with a white fill, and the translated text is written back into
    the same rectangle once the page's redactions have been applied.

    Args:
        translator: Object with an awaitable ``translate(text, dest='en')``
            whose result exposes ``.text`` (this notebook passes a
            ``googletrans.Translator``).
        doc: Open PyMuPDF document; it is modified in place.
        doc_output_path: Destination passed to ``doc.save``.
    """
    # Forms repeat many identical spans (labels, check-box glyphs); translate
    # each distinct string only once.
    translation_cache = {}

    for page in doc:
        redactions = []
        text_dict = page.get_text('dict')

        if 'blocks' not in text_dict:
            continue

        for block in text_dict['blocks']:
            # Blocks without a 'lines' entry have no spans to translate.
            if 'lines' not in block:
                continue
            for line in block['lines']:
                for span in line['spans']:
                    original_text = span['text']

                    if not original_text.strip():
                        continue

                    # Translate the text (reusing an earlier result when available)
                    if original_text not in translation_cache:
                        result = await translator.translate(original_text, dest='en')
                        translation_cache[original_text] = result.text
                    translated_text = translation_cache[original_text]

                    rect = fitz.Rect(span['bbox'])
                    redactions.append((rect, translated_text))

                    # Redaction annotation that covers the whole rectangle in white
                    page.add_redact_annot(rect, fill=(1, 1, 1))

        # Apply the redactions, which removes the original text from the page
        page.apply_redactions()

        # Insert the translated text into each bbox. insert_htmlbox parses its
        # input as HTML, so escape '<', '>' and '&' to keep the text literal.
        for rect, translated_text in redactions:
            page.insert_htmlbox(
                rect,
                html.escape(translated_text, quote=False)
            )

    # Save the translated PDF
    doc.save(doc_output_path)

In [67]:
# Funcion async
# Async version: reopen the source PDF and translate it with a googletrans Translator.
doc = fitz.open(doc_path)
translator = Translator()

# Runs the coroutine to completion; the translated PDF is saved to doc_output_path.
asyncio.run(traducir_pdf_async(translator, doc, doc_output_path))

### Función Semi-Buena

In [64]:
def traducir_pdf(es_translator, doc, doc_output_path):
    """Replace every non-blank text span of ``doc`` with its translation and save.

    Each span is translated with ``es_translator.translate``; when that call
    does not return a string the original text is kept. The replacement text
    is written by the redaction annotation itself (``text=`` argument), so no
    separate insertion step is needed.

    Note: a later cell in this notebook defines another ``traducir_pdf`` with a
    different signature, which replaces this one once that cell runs.

    Args:
        es_translator: Object exposing ``translate(text)``.
        doc: Open PyMuPDF document; it is modified in place.
        doc_output_path: Destination passed to ``doc.save``.
    """
    for page in doc:
        page_content = page.get_text('dict')

        if 'blocks' not in page_content:
            continue

        for text_block in page_content['blocks']:
            # Blocks without a 'lines' entry contribute no spans.
            for text_line in text_block.get('lines', ()):
                for span in text_line['spans']:
                    source_text = span['text']

                    if source_text.strip():
                        # Translate, keeping the source text if no string comes back.
                        candidate = es_translator.translate(source_text)
                        replacement = candidate if isinstance(candidate, str) else source_text

                        # White-filled redaction over the span that also carries the new text.
                        span_rect = fitz.Rect(span['bbox'])
                        page.add_redact_annot(span_rect, text=replacement, fill=(1, 1, 1))

        # Apply this page's redactions.
        page.apply_redactions()

    # Save the translated PDF.
    doc.save(doc_output_path)

In [65]:
# Reopen the source PDF and build a fresh Spanish -> English translator for the synchronous version.
doc = fitz.open(doc_path)
es_translator = GoogleTranslator(source='es', target='en')

# Translates span by span and saves the result to doc_output_path.
traducir_pdf(es_translator, doc, doc_output_path)

### Código Edgar

In [73]:
def traducir_pdf(doc_path, doc_output_path):
    """Translate a PDF to English with the ``translate`` helper and save it.

    Per page: the block texts are normalized into a list of strings, that list
    is sent (shuffled) to ``translate`` with the full document text as
    context, and the response is turned into a ``{palabra: traduccion}``
    mapping. Every non-blank span is then matched against the mapping's keys
    with ``match_words``; matched spans get the translation, unmatched spans
    keep their original text. Spans are redacted with a white fill and the
    chosen text is written back into the same rectangle.

    Relies on the notebook globals ``client`` and ``GEMINI_MODEL`` and on the
    helpers ``extract_pdf_text``, ``normalize_string``,
    ``remove_number_parenthesis_space``, ``translate`` and ``match_words``.

    Note: this definition replaces the earlier
    ``traducir_pdf(es_translator, doc, doc_output_path)`` in this notebook.

    Args:
        doc_path: Path of the source PDF.
        doc_output_path: Destination passed to ``doc.save``.
    """
    all_text = extract_pdf_text(doc_path)

    # Context manager so the document is closed even if a translation call raises.
    with fitz.open(doc_path) as doc:
        for page in doc:
            redactions = []
            page_texts = [b[4].strip() for b in page.get_text('blocks')]

            normalized_page_texts = []
            for p in page_texts:
                normalized_page_texts.extend(normalize_string(p))
            normalized_page_texts = [remove_number_parenthesis_space(p) for p in normalized_page_texts if len(p) > 2]

            final_texts_to_search = normalized_page_texts

            # random.sample over the full length yields a shuffled copy
            # (presumably to avoid order effects in the model - TODO confirm).
            traducciones = translate(client=client,
                                     model=GEMINI_MODEL,
                                     language='en',
                                     palabras='\n'.join(random.sample(final_texts_to_search, len(final_texts_to_search))),
                                     contexto=all_text)

            traducciones = {trad.palabra: trad.traduccion for trad in traducciones.parsed.traducciones}
            # Built once per page instead of once per span.
            candidates = list(traducciones.keys())

            text_dict = page.get_text('dict')

            if 'blocks' not in text_dict:
                continue

            for block in text_dict['blocks']:
                # Blocks without a 'lines' entry have no spans to translate.
                if 'lines' not in block:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        original_text = span['text']

                        if not original_text.strip():
                            continue

                        # Look up the translation; keep the original text when nothing matches
                        translate_key = match_words(word=original_text, candidates=candidates)

                        if translate_key:
                            translated_text = traducciones[translate_key]
                        else:
                            translated_text = original_text

                        # Remember the rectangle together with the text to write into it
                        rect = fitz.Rect(span['bbox'])
                        redactions.append((rect, translated_text))

                        # Redaction annotation that erases the original content
                        page.add_redact_annot(rect, fill=(1, 1, 1))

            # Apply the redactions
            page.apply_redactions()

            # Insert the translated text into each bbox. insert_htmlbox parses its
            # input as HTML, so escape '<', '>' and '&' to keep the text literal.
            for rect, translated_text in redactions:
                page.insert_htmlbox(
                    rect,
                    html.escape(translated_text, quote=False)
                )

        # Save the translated PDF
        doc.save(doc_output_path)

In [74]:
# Calls the two-argument traducir_pdf defined in the preceding cell; the result is saved to doc_output_path.
traducir_pdf(doc_path, doc_output_path)



No match for word --  
No match for word - 
No match for word H 
No match for word (2)
No match for word (3)
No match for word V 
No match for word Sp  
No match for word 2)
No match for word -
No match for word - 
No match for word H 
No match for word (3)
No match for word V 
No match for word Sp  
No match for word (2)
No match for word 3)
No match for word (5)
No match for word      □




No match for word …………………………………………………………………………………………………………
No match for word 5)
No match for word (7)
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word     □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word     □
No match for word     □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □
No match for word )
No match for word □
No match for word □
No match for word □
No match for word □
No match for word □




No match for word □
No match for word □




No match for word H
No match for word M
No match for word *
No match for word (2) 
No match for word (3)
No match for word S
No match for word oltero
No match for word /
No match for word  C
No match for word asado
No match for word /
No match for word  V
No match for word iudo
No match for word /
No match for word  D
No match for word ivorciado
No match for word /
No match for word  S
No match for word p
No match for word arado
No match for word (4)
No match for word (5) 
No match for word (6) 
No match for word ). 
No match for word (7)
No match for word (8)
No match for word dpd@mitramiss.es


In [48]:
pdfs = [doc_path]

# Only the first PDF in the list is processed.
for pdf in pdfs[:1]:

    doc = pymupdf.open(pdf)
    page = doc[0]
    # Plain text of the first page, later passed to the translator as context.
    all_text = page.get_text()

    # Item 4 of each 'blocks' tuple is the block's text.
    page_texts = [text_block[4].strip() for text_block in page.get_text('blocks')]

    # Flatten the pieces returned by normalize_string for every block.
    normalized_page_texts = []
    for p in page_texts:
        normalized_page_texts += normalize_string(p)

    # Keep pieces longer than two characters, cleaned by remove_number_parenthesis_space.
    normalized_page_texts = [
        remove_number_parenthesis_space(piece)
        for piece in normalized_page_texts
        if len(piece) > 2
    ]

    final_texts_to_search = normalized_page_texts

In [49]:
# Send the search texts (in shuffled order, one per line) to the translator,
# using the page text as context.
traducciones = translate(
    client=client,
    model=GEMINI_MODEL,
    language='en',
    palabras='\n'.join(random.sample(final_texts_to_search, k=len(final_texts_to_search))),
    contexto=all_text,
)

# Collapse the parsed response into a {source text: translation} mapping.
traducciones = dict(
    (item.palabra, item.traduccion) for item in traducciones.parsed.traducciones
)

In [50]:
# For every normalized text: locate it on the page, pick its translation and
# replace the first occurrence through a redaction annotation.
for search_text in final_texts_to_search:
    
    hits = page.search_for(search_text)
    if len(hits) == 0:
        print(f'NO HIT for text: {search_text}')
        
    # Only the first hit of each search text is replaced.
    for hit in hits[:1]:
        translate_key = match_words(word=search_text, candidates=list(traducciones.keys()))
        
        # Fall back to the original text when no translation key matches.
        if translate_key:
            traduccion = traducciones[translate_key]
        else:
            print(f'Texto: {search_text} sin traducción')
            traduccion = search_text
        # Widen the rectangle 50 units to the right
        # (presumably to make room for longer translated text - TODO confirm).
        hit.x1 += 50
        # page.draw_rect(hit, color=None, fill=pymupdf.pdfcolor['white'])
        # page.insert_htmlbox(hit, ' '.join(traduccion.split('\n')))
        # Newlines in the translation are replaced by spaces before insertion.
        page.add_redact_annot(hit, ' '.join(traduccion.split('\n')), fontname='helv', fontsize=20)
    # NOTE(review): this runs once per search text, so every later search_for
    # sees the already-modified page - confirm this is intended rather than a
    # single apply after the loop.
    page.apply_redactions() 

# Written to the current working directory, not to doc_output_path.
doc.save("replaced.pdf", garbage=0, deflate=True)