# Extracción

Leemos todos los PDFs y extraemos los metadatos, el texto y las imágenes. Así no tenemos que laburar directamente con los PDFs, que es más molesto.

In [1]:
import sys
sys.path.append('..')

In [2]:
import hashlib
import json
import os
from pathlib import Path

from pypdf import PdfReader
from tqdm import tqdm

In [3]:
# Folder that is scanned for the source PDF files.
DATA_PATH = Path("..") / "data"
# Folder that receives the extracted images and the samples.json index.
EXTRACT_PATH = Path("..") / "data-extracted"

In [4]:
def process_text(pages):
    """Concatenate the extracted text of every page into a single string.

    Args:
        pages: Iterable of page objects exposing ``extract_text()``.

    Returns:
        The page texts joined with newlines, with leading and trailing
        whitespace stripped from the combined result.
    """
    page_texts = []
    for page in pages:
        page_texts.append(page.extract_text())

    return "\n".join(page_texts).strip()

In [5]:
def process_images(filename, pages, output_root=None):
    """Save the unique, non-QR images found in ``pages`` as PNG files.

    Images whose size is exactly 370x370 or 330x330 are treated as QR codes
    and dropped. The remaining images are de-duplicated by exact content and
    written to ``<output_root>/<filename>/<i>.png``, where ``i`` is the
    position of the image among the non-QR images (so skipped duplicates
    leave gaps in the numbering).

    Args:
        filename: Name of the source document; used as the output sub-folder.
        pages: Iterable of page objects whose ``images`` items expose an
            ``image`` attribute (an object with ``size``, ``mode``,
            ``tobytes()`` and ``save()``).
        output_root: Directory under which the per-document folder is
            created. Defaults to the module-level ``EXTRACT_PATH``.

    Returns:
        List of absolute paths (as strings) of the files written, in order.
    """
    qr_sizes = ((370, 370), (330, 330))
    root = EXTRACT_PATH if output_root is None else Path(output_root)
    folder = root / filename

    non_qr_images = (
        file.image
        for page in pages
        for file in page.images
        if file.image.size not in qr_sizes
    )

    seen = set()
    image_paths = []
    for index, image in enumerate(non_qr_images):
        # Raw pixel bytes alone are ambiguous: images with different
        # dimensions or modes can serialize to the same byte string, so the
        # mode and size are part of the duplicate key.
        digest = hashlib.md5(image.tobytes()).hexdigest()
        key = (image.mode, tuple(image.size), digest)
        if key in seen:
            continue
        seen.add(key)

        # Create the folder lazily so documents without images leave no
        # empty directory behind.
        if not image_paths:
            folder.mkdir(parents=True, exist_ok=True)

        image_path = folder / f"{index}.png"
        image.save(image_path)
        image_paths.append(str(image_path.resolve()))

    return image_paths

In [6]:
# Create the output folder up front: process_images only creates it when a
# PDF yields at least one image, and samples.json is written there at the end.
EXTRACT_PATH.mkdir(parents=True, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort for a reproducible run.
files = sorted(os.listdir(DATA_PATH))
# Filter before the loop so the progress bar total only counts PDFs.
pdf_files = [name for name in files if name.endswith(".pdf")]
samples = []

for filename in tqdm(pdf_files):
    filepath = DATA_PATH / filename
    reader = PdfReader(filepath)

    text = process_text(reader.pages)
    images = process_images(filename, reader.pages)

    # One record per PDF: file metadata, full text and saved image paths.
    samples.append({
        "filename": filename,
        "size": os.path.getsize(filepath),
        "pages": len(reader.pages),
        "text": text,
        "images": images,
    })

with open(EXTRACT_PATH / "samples.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=2)

100%|██████████| 8411/8411 [6:08:40<00:00,  2.63s/it]  
