In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment, then read the MinIO connection settings. A missing variable
# raises KeyError immediately rather than failing later at connection time.
load_dotenv()

MINIO_URL = os.environ['MINIO_URL']
MINIO_ACCESS_KEY = os.environ['MINIO_ACCESS_KEY']
MINIO_SECRET_KEY = os.environ['MINIO_SECRET_KEY']
# Secure mode is on only when MINIO_SECURE is exactly the string 'true'.
MINIO_SECURE = os.environ['MINIO_SECURE'] == 'true'

### MinIO Utility Functions

In [None]:
import tempfile

from minio import Minio
from minio.error import S3Error


def get_pdf_from_minio(bucket_name: str, object_name: str) -> str:
    '''
    Downloads an object from MinIO into the system temp directory and returns
    the path to the downloaded file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        object_name: Name of the object to download. It is also used as the
            file name inside the temp directory.

    Returns:
        Path of the local copy (temp directory joined with object_name).

    Raises:
        S3Error: If the MinIO request fails. Errors are propagated unchanged.
    '''
    # Create client with access and secret key
    client = Minio(MINIO_URL,  # host.docker.internal
                   MINIO_ACCESS_KEY,
                   MINIO_SECRET_KEY,
                   secure=MINIO_SECURE)

    # The local copy lives in the shared temp directory under the object's own
    # name; it is not deleted here, so cleanup is left to the caller.
    temp_file = os.path.join(tempfile.gettempdir(), object_name)

    # Save object to file.
    client.fget_object(bucket_name, object_name, temp_file)

    return temp_file


def save_chunk_to_minio(bucket_name: str, object_name: str, 
                        file_path: str, metadata: dict) -> None:
    '''
    Saves a document chunk to MinIO.

    Args:
        bucket_name: Name of the destination bucket.
        object_name: Name the uploaded object will have in the bucket.
        file_path: Path of the local file whose contents are uploaded.
        metadata: Metadata passed through to the MinIO client and attached to
            the uploaded object.

    Raises:
        S3Error: If the MinIO request fails. Errors are propagated unchanged.
    '''
    # Create client with access and secret key
    client = Minio(MINIO_URL,  # host.docker.internal
                   MINIO_ACCESS_KEY,
                   MINIO_SECRET_KEY,
                   secure=MINIO_SECURE)

    client.fput_object(bucket_name, object_name, file_path, metadata=metadata)

### Read a File from MinIO

In [None]:
# Bucket the source PDFs are read from, and bucket the chunks are written to.
original_corpus_bucket_name = 'original-documents'
chunked_corpus_bucket_name = 'document-chunks'
# Document to process; the commented-out line is an alternative sample.
#object_name = 'mobile-home-manual.pdf'
object_name = 'Attention is all you need.pdf'

# Download the PDF; temp_file is the local path that later cells parse.
temp_file = get_pdf_from_minio(original_corpus_bucket_name, object_name)

### Chunk File

In [None]:
import openparse

# Parse the downloaded PDF (temp_file) with openparse's default parser.
parser = openparse.DocumentParser()
parsed_basic_doc = parser.parse(temp_file)

# Each parsed node is treated as one chunk of the document.
print('Number of chunks:', len(parsed_basic_doc.nodes))

# Print every node for a quick visual check of the parse result.
for node in parsed_basic_doc.nodes:
    print(node)

### Display Nodes

In [None]:
# Show the first four parsed nodes on top of the original PDF pages.

pdf = openparse.Pdf(temp_file)
pdf.display_with_bboxes(parsed_basic_doc.nodes[:4])

### Save Chunks to MinIO

In [None]:
# Check what the parsed document serializes to. `model_dump()` is the
# Pydantic v2 name for the deprecated `.dict()`, matching the
# `model_dump_json()` call used elsewhere in this notebook.
type(parsed_basic_doc.model_dump())

In [None]:
import json

# Round-trip the parsed document through JSON so `chunks` holds only what
# json.loads produces (dicts, lists, strings, numbers, booleans, None).
chunks = json.loads(parsed_basic_doc.model_dump_json())

# Peek at the top-level keys and at the first node.
first_node = chunks['nodes'][0]
print(chunks.keys())
print(first_node)
print(type(first_node))
chunks

In [None]:
# Show the 'table_parsing_kwargs' entry of the deserialized document.
chunks['table_parsing_kwargs']

In [None]:
# Confirm the type of the object produced by json.loads.
type(chunks)

In [None]:
import json

# Scratch file that every chunk is written to before it is uploaded. It has
# its own name so that `temp_file` keeps pointing at the downloaded PDF and
# the earlier cells stay re-runnable.
temp_dir = tempfile.gettempdir()
chunk_temp_file = os.path.join(temp_dir, 'tmp.json')
print(chunk_temp_file)

# Document-level fields attached as object metadata to every uploaded chunk.
metadata = {
    'filename': chunks['filename'],
    'num_pages': chunks['num_pages'],
    'coordinate_system': chunks['coordinate_system'],
    'table_parsing_kwargs': chunks['table_parsing_kwargs'],
}
print(metadata)

# Base name of the source document without its extension; it is the same for
# every chunk, so it is computed once outside the loop.
chunk_name = os.path.splitext(object_name)[0]

for chunk_num, node in enumerate(chunks['nodes']):
    with open(chunk_temp_file, 'w') as f:
        json.dump(node, f)
    # Upload only after the `with` block has closed the file, so the buffered
    # JSON has been flushed to disk. Uploading while the file is still open
    # can send an empty or truncated object.
    save_chunk_to_minio(chunked_corpus_bucket_name,
                        f'{chunk_num} - {chunk_name}.json',
                        chunk_temp_file, metadata)