In [None]:
from pathlib import Path

# Project data layout, relative to the notebook's working directory.
DATA_DIR = Path("../data/input")  # source PDFs
TRUE_TARGET_DIR = Path("../data/target")  # per-PDF ground-truth JSON files
PROCESSED_DATA_DIR = Path("../data/processed/")  # rendered pages and extracted text
SHOTS_DIR = Path("../data/shots/")  # renders of the annotated start pages

# Create any missing folder (including parents) so later cells can read
# from and write to these locations without extra checks.
for _dir in (DATA_DIR, TRUE_TARGET_DIR, PROCESSED_DATA_DIR, SHOTS_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

### Images

In [None]:
import fitz
from tqdm.auto import tqdm
import json

# Render every page of every input PDF to a low-resolution PNG stored at
# PROCESSED_DATA_DIR/<pdf stem>/images/<page index>.png.
# sorted(...) materialises the glob so tqdm can show a total and the
# processing order is deterministic across runs.
for doc_path in tqdm(sorted(DATA_DIR.glob("*.pdf"))):
    # The output folder depends only on the document, so create it once per
    # PDF rather than once per page.
    images_dir = PROCESSED_DATA_DIR / doc_path.stem / "images"
    images_dir.mkdir(exist_ok=True, parents=True)

    # The context manager closes the PDF as soon as its pages are rendered,
    # so open file handles do not pile up while looping over many documents.
    with fitz.open(doc_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=55)
            output_path = images_dir / f"{page_num}.png"
            pix.save(output_path)

### Jsons

In [None]:
# Extract the text of every page of every input PDF and store it as a JSON
# list (one string per page, in page order) at
# PROCESSED_DATA_DIR/<pdf stem>/text.json.
for doc_path in tqdm(sorted(DATA_DIR.glob("*.pdf"))):
    # Close the PDF as soon as its text has been read.
    with fitz.open(doc_path) as doc:
        pages = [
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        ]

    doc_dir = PROCESSED_DATA_DIR / doc_path.stem
    doc_dir.mkdir(exist_ok=True, parents=True)

    output_json = doc_dir / "text.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding may
    # be unable to represent them.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(pages, f, indent=4, ensure_ascii=False)

### Shots

#### Start classification

In [None]:
# I don't want to put this in the "Images" part because I'm not sure if it's the final structure

# For every input PDF, read its ground-truth annotation from
# TRUE_TARGET_DIR/<pdf stem>.json and render each annotated start page to
# SHOTS_DIR/<pdf stem>/<document_type>_<start_page>.png.
for doc_path in tqdm(sorted(DATA_DIR.glob("*.pdf"))):
    # Read the annotation as UTF-8 explicitly instead of relying on the
    # platform default encoding.
    with open(TRUE_TARGET_DIR / f"{doc_path.stem}.json", "r", encoding="utf-8") as f:
        forms = json.load(f)

    # Map start page -> document type. If several forms share a start page,
    # the last one listed wins.
    type_by_start_page = {form["start_page"]: form["document_type"] for form in forms}

    # The output folder depends only on the document, so create it once.
    shots_dir = SHOTS_DIR / doc_path.stem
    shots_dir.mkdir(exist_ok=True, parents=True)

    # The context manager closes the PDF once its start pages are rendered.
    with fitz.open(doc_path) as doc:
        for page_num, document_type in type_by_start_page.items():
            # NOTE(review): start_page is passed straight to load_page, which
            # takes a 0-based page number — confirm the annotations are 0-based.
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=55)
            output_path = shots_dir / f"{document_type}_{page_num}.png"
            pix.save(output_path)