In [None]:
import torch

# Report which CUDA device (index 0) torch sees, or say that none is available.
if not torch.cuda.is_available():
    print("No GPU found.")
else:
    print("GPU Name:", torch.cuda.get_device_name(0))

In [None]:
import datetime
import logging
import time
from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.utils.export import generate_multimodal_pages
from docling.utils.utils import create_hash
# Module-level logger; main() uses it to report how long the export took.
_log = logging.getLogger(__name__)
# Scale factor assigned to the PDF pipeline's `images_scale` option in main().
# A scale of 1 corresponds to a 72 DPI page image, so 2.0 yields 144 DPI.
IMAGE_RESOLUTION_SCALE = 2.0
def main(
    input_doc_path=Path("pdfs/smart_baby_camera.pdf"),
    output_dir=Path("scratch"),
):
    """Convert a PDF with docling and export one row per page to a Parquet file.

    Each row holds the page image (size and raw bytes), the extracted text in
    plain, Markdown and doctags form, the page cells and segments, and a few
    page-level metadata fields. All rows are flattened with
    ``pd.json_normalize`` and written to
    ``<output_dir>/multimodal_<YYYY-MM-DD_HHMMSS>.parquet``.

    Args:
        input_doc_path: Path (or string) of the PDF to convert. Defaults to the
            sample document used by this notebook.
        output_dir: Directory (path or string) that receives the Parquet file.
            It is created, including parents, when it does not exist.

    Returns:
        pathlib.Path: Location of the Parquet file that was written, so callers
        do not have to guess the timestamped file name.
    """
    logging.basicConfig(level=logging.INFO)

    # Accept plain strings as well as Path objects.
    input_doc_path = Path(input_doc_path)
    output_dir = Path(output_dir)

    # Create the output directory first so an unusable location fails before
    # the conversion runs.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Important: to work with page images we must ask the pipeline to keep
    # them, otherwise the DocumentConverter discards them to free memory.
    # `images_scale` also defines the resolution of the kept images:
    # a scale of 1 corresponds to a standard 72 DPI image.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()

    conv_res = doc_converter.convert(input_doc_path)

    rows = []
    for (
        content_text,
        content_md,
        content_dt,
        page_cells,
        page_segments,
        page,
    ) in generate_multimodal_pages(conv_res):

        # 72 DPI per unit of image scale (see the note on images_scale above).
        dpi = page._default_image_scale * 72

        rows.append(
            {
                "document": conv_res.input.file.name,
                "hash": conv_res.input.document_hash,
                # NOTE(review): the hash is seeded with `page_no - 1` while
                # "page_num" below uses `page_no + 1`; the offsets disagree.
                # Left unchanged so hashes stay stable -- confirm the intent.
                "page_hash": create_hash(
                    conv_res.input.document_hash + ":" + str(page.page_no - 1)
                ),
                "image": {
                    "width": page.image.width,
                    "height": page.image.height,
                    "bytes": page.image.tobytes(),
                },
                "cells": page_cells,
                "contents": content_text,
                "contents_md": content_md,
                "contents_dt": content_dt,
                "segments": page_segments,
                "extra": {
                    "page_num": page.page_no + 1,
                    "width_in_points": page.size.width,
                    "height_in_points": page.size.height,
                    "dpi": dpi,
                },
            }
        )

    # Flatten the nested dicts into dotted column names (e.g. "image.width")
    # and write every page of the document into a single Parquet file.
    df = pd.json_normalize(rows)
    now = datetime.datetime.now()
    output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
    df.to_parquet(output_filename)

    elapsed = time.perf_counter() - start_time

    _log.info(
        "Document converted and multimodal pages generated in %.2f seconds.",
        elapsed,
    )
    _log.info("Multimodal pages written to %s", output_filename)

    return output_filename

# Run the export only when this cell/script is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd

from pathlib import Path

# main() writes "scratch/multimodal_<YYYY-MM-DD_HHMMSS>.parquet" with a fresh
# timestamp on every run, so a hard-coded file name goes stale immediately.
# Pick the most recent export instead; the timestamp format sorts
# chronologically when sorted as text.
parquet_candidates = sorted(Path("scratch").glob("multimodal_*.parquet"))
if not parquet_candidates:
    raise FileNotFoundError(
        "No 'multimodal_*.parquet' file found in 'scratch'; run the export cell first."
    )
# Kept as a string, as before; assign another path here to load a specific export.
parquet_file = str(parquet_candidates[-1])

# Load the Parquet file into a DataFrame
df = pd.read_parquet(parquet_file)

# Preview the first rows
print(df.head())


In [None]:
# Text of the first row (index 0)
page_texts = df['contents']
print(page_texts[0])

# Print the text of every row, numbered from 1 and separated by a ruler
for page_number, page_text in enumerate(page_texts, start=1):
    print(f"Página {page_number}:")
    print(page_text)
    print("=" * 40)


In [None]:
from PIL import Image
import io

# Read the image dimensions and the raw pixel buffer from the first DataFrame row
image_width, image_height, image_bytes = (
    df.iloc[0][column]
    for column in ('image.width', 'image.height', 'image.bytes')
)

# Rebuild the picture from the stored bytes, decoded as RGB with the 'raw' decoder
image = Image.frombytes(
    'RGB',
    (image_width, image_height),
    image_bytes,
    'raw',
)

# Display the image
image.show()

In [None]:
# Show the extracted text stored in the first row (index label 0) of the DataFrame.
print(df.loc[0, 'contents'])


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured for 500-character chunks with a 100-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# One record per text chunk, carrying the page number and document name of
# the row it came from as metadata
chunks = [
    {
        "text": piece,
        "page_num": page_row['extra.page_num'],
        "document_name": page_row['document'],
    }
    for _, page_row in df.iterrows()
    for piece in text_splitter.split_text(page_row['contents'])
]


In [None]:
from sentence_transformers import SentenceTransformer

# Collect the chunk texts in chunk order; the embeddings come back in the same order
texts = [entry['text'] for entry in chunks]

# Load the embedding model and encode every chunk text
embedding_model = SentenceTransformer('sentence-transformers/gtr-t5-large')
embeddings = embedding_model.encode(
    texts,
    show_progress_bar=True,
)


In [None]:
import numpy as np

# Attach each embedding to its chunk; stored as a plain list so the records
# stay friendly to JSON/serialization
for record, vector in zip(chunks, embeddings):
    record.update(embedding=vector.tolist())

In [None]:
import pickle

# Persist the chunk records (text, metadata and embedding) to a pickle file.
# Pickle files should only ever be loaded back from a trusted source.
output_path = "chunks_with_embeddings.pkl"
with open(output_path, mode="wb") as pickle_file:
    pickle.dump(chunks, pickle_file)

print("Chunks com embeddings salvos em 'chunks_with_embeddings.pkl'")
