In [None]:
import json
import os
import re

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support

from tqdm import tqdm

import pdfplumber
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import easyocr

from preprocessing import *

In [6]:
_EMPTY_TEXT = "[EMPTY]"


def _text_or_placeholder(text):
    """Return `text` unchanged, or "[EMPTY]" when it is None, empty or whitespace-only."""
    if text and text.strip():
        return text
    return _EMPTY_TEXT


def extract_paragraphs_with_ocr(pdf_path):
    """Extract text blocks from a PDF, falling back to OCR for sparse pages.

    Each page is first read through pdfplumber's text layer. When that yields
    at least five lines and at least one of them is non-blank, every line
    whose bounding box passes the coordinate check becomes its own block.
    Otherwise the page is rendered at 300 DPI, passed to Tesseract (English),
    and the whole page becomes a single block spanning the full page area.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with two keys:
          "pages": maps the 0-based page index (as a string) to
              {"width": ..., "height": ...} taken from the pdfplumber page.
          "contents": list of block dicts with keys "box"
              ([x0, top, x1, bottom]), "text", "page", "id" and "order".
              "id" and "order" hold the same running counter, which
              continues across pages.

    Notes:
        Blank text (None, empty or whitespace-only, e.g. the form-feed page
        separator Tesseract emits for an empty page) is stored as "[EMPTY]".
        An OCR failure is printed and also recorded as "[EMPTY]"; it does not
        abort the document.
    """
    structured_blocks = []
    page_sizes = {}
    order = 0

    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            page_sizes[str(page_idx)] = {"width": page.width, "height": page.height}
            lines = list(page.extract_text_lines())

            # Trust the text layer only when it has a reasonable number of
            # lines and at least one carries visible text.
            has_text_layer = len(lines) >= 5 and any(
                (line.get("text") or "").strip() for line in lines
            )

            if has_text_layer:
                # Loop-invariant upper bound used by the coordinate check.
                coord_limit = max(page.width, page.height)
                for line in lines:
                    box = [line["x0"], line["top"], line["x1"], line["bottom"]]
                    # Skip lines with a negative or out-of-range coordinate.
                    if any(coord < 0 or coord > coord_limit for coord in box):
                        continue
                    structured_blocks.append({
                        "box": box,
                        "text": _text_or_placeholder(line["text"]),
                        "page": page_idx,
                        "id": order,
                        "order": order
                    })
                    order += 1
            else:
                try:
                    pil_img = page.to_image(resolution=300).original
                    ocr_text = pytesseract.image_to_string(pil_img, lang='eng')
                except Exception as e:
                    print(f"OCR failed for page {page_idx} in {pdf_path}: {e}")
                    ocr_text = _EMPTY_TEXT
                structured_blocks.append({
                    "box": [0, 0, page.width, page.height],
                    # A plain `or` is not enough here: OCR of a blank page is
                    # whitespace / form feed, which is truthy.
                    "text": _text_or_placeholder(ocr_text),
                    "page": page_idx,
                    "id": order,
                    "order": order
                })
                order += 1

    return {
        "pages": page_sizes,
        "contents": structured_blocks
    }

# Batch-convert every PDF in ./pdf_input into a layout JSON in ./json_input.
test_pdf_dir = './pdf_input'
test_output_dir = './json_input'
os.makedirs(test_output_dir, exist_ok=True)

# sorted() gives a reproducible processing order across runs and platforms.
for fname in sorted(os.listdir(test_pdf_dir)):
    if not fname.lower().endswith('.pdf'):
        continue
    in_pdf = os.path.join(test_pdf_dir, fname)
    out_json = os.path.join(test_output_dir, os.path.splitext(fname)[0] + ".json")
    try:
        doc = extract_paragraphs_with_ocr(in_pdf)
        # Serialize before opening the target so that a failure cannot leave
        # a truncated JSON file behind for the inference step to pick up.
        payload = json.dumps(doc, indent=2, ensure_ascii=False)
    except Exception as e:
        # Report the failing file and keep going, so one unreadable PDF does
        # not abort the rest of the batch.
        print(f"Failed to process {fname}: {e}")
        continue
    with open(out_json, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Processed: {fname} -> {out_json}")

def main(confidence_threshold=0.7):
    """Run FastInferenceModel over every layout JSON in ./json_input.

    For each input file, the predictions whose 'confidence' is strictly
    greater than `confidence_threshold` are written to
    ./pdf_output/<input stem>_predictions.json; when none qualify, no file is
    written. All predictions, not only the retained ones, are printed.

    Args:
        confidence_threshold: Minimum (exclusive) confidence a prediction
            needs in order to be saved. Defaults to 0.7.

    Returns:
        None. Returns early, after printing a message, when there are no
        input files, when the tokenizer cannot be loaded, or when the model
        cannot be initialized. An exception raised while handling one input
        file is printed and the loop moves on to the next file.
    """
    print("=== DocIENet Model Inference for PDF Input ===")
    input_dir = "./json_input"
    output_dir = "./pdf_output"
    tokenizer_name = 'prajjwal1/bert-tiny'
    os.makedirs(output_dir, exist_ok=True)
    input_files = [
        os.path.join(input_dir, f)
        for f in sorted(os.listdir(input_dir))
        if f.endswith('.json') and not f.endswith('_predictions.json')
    ]

    if not input_files:
        print("No JSON files found in ./json_input. Please run preprocess_pdf.py.")
        return

    try:
        # The returned tokenizer is not used below; the call is kept because
        # a failed load stops the run here.
        # NOTE(review): confirm whether this check is still wanted.
        AutoTokenizer.from_pretrained(tokenizer_name)
    except Exception as e:
        print(f"Failed to load tokenizer from {tokenizer_name}: {e}")
        return

    # The device is only reported; it is not passed to the model here.
    device = torch.device('cpu')
    print(f"\nUsing device: {device}")

    try:
        inference_model = FastInferenceModel("./models")
    except Exception as e:
        print(f"Failed to initialize FastInferenceModel: {e}")
        return

    for input_file in input_files:
        print(f"Testing inference on {input_file}")
        try:
            results = inference_model.predict(input_file)
            # Keep only predictions above the configured confidence threshold.
            filtered_results = [
                pred for pred in results if pred['confidence'] > confidence_threshold
            ]
            # Strip only the final extension; str.replace would rewrite every
            # '.json' occurrence inside the file name.
            stem = os.path.splitext(os.path.basename(input_file))[0]
            output_file = os.path.join(output_dir, stem + '_predictions.json')
            if filtered_results:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(filtered_results, f, indent=2, ensure_ascii=False)
                print(f"Saved {len(filtered_results)} predictions to {output_file}")
            else:
                print(
                    f"No predictions with confidence > {confidence_threshold} "
                    f"for {input_file}. Skipping output."
                )
            print("\nPredictions:")
            for pred in results:
                print(f"ID: {pred['id']}")
                print(f"  Pred label: {pred['label']}")
                print(f"  Pred order: {pred['order']}")
                print(f"  Pred parent_id: {pred['parent_id']}")
                print(f"  Text: {pred['text']}")
                print(f"  Confidence: {pred['confidence']:.4f}")
                print("---")
        except Exception as e:
            print(f"Error processing {input_file}: {e}")

# Entry point: run inference only when this code executes as the top-level
# module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Processed: DV_mini_project.pdf -> ./json_input\DV_mini_project.json
Processed: file01.pdf -> ./json_input\file01.json
Processed: file02.pdf -> ./json_input\file02.json
Processed: file03.pdf -> ./json_input\file03.json
Processed: file04.pdf -> ./json_input\file04.json
Processed: file05.pdf -> ./json_input\file05.json
Processed: Hackathon 2025 - Problem Statement Summary (1)b0ee386.pdf -> ./json_input\Hackathon 2025 - Problem Statement Summary (1)b0ee386.json
=== DocIENet Model Inference for PDF Input ===

Using device: cpu
Testing inference on ./json_input\DV_mini_project.json
Error loading ground truth for ./json_input\DV_mini_project.json: [Errno 2] No such file or directory: './json_output\\DV_mini_project.json'
Inference completed in 0.08 seconds
Saved 45 predictions to ./pdf_output\DV_mini_project_predictions.json

Predictions:
ID: 1
  Pred label: title
  Pred order: 1
  Pred parent_id: 0
  Text: DV Mini Project
  Confidence: 0.5969
---
ID: 2
  Pred label: title
  Pred order: 0
  P