In [6]:
# !pip install python-bidi arabic-reshaper reportlab Pillow pdf2image
# !apt-get install poppler-utils

In [11]:
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from reportlab.lib.pagesizes import letter
import arabic_reshaper
from bidi.algorithm import get_display
from pdf2image import convert_from_path
import json
from tqdm.notebook import tqdm
import numpy as np
from shapely.geometry import box

In [8]:
# Rasterise the source PDF into one image per page, then keep each image's
# `.size`; draw_arabic_text later unpacks it as (width, height).
pdf_images = convert_from_path(
    '../../data/constitution_pdfs/constitution_urdu.pdf'
)
page_sizes = [page_image.size for page_image in pdf_images]

In [9]:
# Load the OCR detections. The encoding is passed explicitly because the
# records contain Urdu text and open() otherwise falls back to the platform's
# locale encoding, which is not guaranteed to be UTF-8.
with open('../../data/english_to_urdu_translations/urdu_ocr_results.json', 'r', encoding='utf-8') as f:
    ocr_detections = json.load(f)

# Show the first record to illustrate the schema (page_no, text, bbox)
ocr_detections[0]

{'page_no': 0,
 'text': 'اسلامی جمہوریہ پاکستان',
 'bbox': [290.6603088378906,
  621.9564208984375,
  1069.901611328125,
  822.6404418945312]}

# Detection processing: removing duplicate detections of the same word

In [13]:
def bbox_iou(box1, box2):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    The value is computed in closed form from the corner coordinates, so no
    geometry objects have to be built for each comparison.

    Args:
        box1: Four numbers ``[x0, y0, x1, y1]`` giving two opposite corners.
        box2: Same layout as ``box1``.

    Returns:
        Intersection area divided by union area, or ``0`` when the union has
        zero area (both boxes degenerate).
    """
    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2

    # Normalise each box so that (left, bottom) <= (right, top) regardless of
    # the order in which the corners were supplied.
    a_left, a_right = min(ax0, ax1), max(ax0, ax1)
    a_bottom, a_top = min(ay0, ay1), max(ay0, ay1)
    b_left, b_right = min(bx0, bx1), max(bx0, bx1)
    b_bottom, b_top = min(by0, by1), max(by0, by1)

    # Overlap extent along each axis; non-positive means the boxes are
    # disjoint (or merely touching) on that axis.
    overlap_w = min(a_right, b_right) - max(a_left, b_left)
    overlap_h = min(a_top, b_top) - max(a_bottom, b_bottom)
    inter_area = overlap_w * overlap_h if overlap_w > 0 and overlap_h > 0 else 0.0

    area_a = (a_right - a_left) * (a_top - a_bottom)
    area_b = (b_right - b_left) * (b_top - b_bottom)
    union_area = area_a + area_b - inter_area

    iou = inter_area / union_area if union_area != 0 else 0
    return iou

def process_page_detections(detections, thresh):
    """Drop repeated detections of the same text within a single page.

    A detection is kept unless an already-kept detection with an identical
    ``"text"`` value has a bounding box whose IoU with it exceeds ``thresh``.
    Earlier detections take precedence and input order is preserved.

    Args:
        detections: Iterable of dicts with ``"text"`` and ``"bbox"`` keys.
        thresh: IoU value above which two same-text boxes count as duplicates.

    Returns:
        List of the kept detection dicts (the same objects, not copies).
    """
    kept = []
    boxes_by_text = {}

    for candidate in detections:
        text_key = candidate["text"]
        candidate_box = candidate["bbox"]
        known_boxes = boxes_by_text.setdefault(text_key, [])

        # Stops at the first kept box that overlaps strongly enough
        is_duplicate = any(
            bbox_iou(candidate_box, known_box) > thresh
            for known_box in known_boxes
        )
        if is_duplicate:
            continue

        known_boxes.append(candidate_box)
        kept.append(candidate)

    return kept

def process_detections(ocr_detections, thresh=0.7):
    """De-duplicate OCR detections page by page.

    Detections are grouped by their ``"page_no"`` value and each group is
    passed to ``process_page_detections``. The per-page results are
    concatenated in the order in which each page first appears in the input.

    Args:
        ocr_detections: Iterable of detection dicts, each with a
            ``"page_no"`` key plus whatever ``process_page_detections`` reads.
        thresh: IoU threshold forwarded to ``process_page_detections``.

    Returns:
        A flat list of the detections kept across all pages.

    Raises:
        KeyError: If a detection has no ``"page_no"`` key.
    """
    page_detections = {}

    for detection in ocr_detections:
        # setdefault creates the page's list on first sight, so no exception
        # handler is needed and unrelated errors are never masked.
        page_detections.setdefault(detection["page_no"], []).append(detection)

    processed_detections = []

    for detections in tqdm(page_detections.values()):
        processed_detections.extend(process_page_detections(detections, thresh))

    return processed_detections

In [91]:
# Override the default thresh (0.7) with 0.2: same-text boxes only need an
# IoU above 0.2 to be collapsed into a single detection.
processed_detections = process_detections(ocr_detections, thresh=0.2)

  0%|          | 0/253 [00:00<?, ?it/s]

In [92]:
# Detection counts before and after de-duplication
len(ocr_detections), len(processed_detections)

(9317, 8281)

In [93]:
# Spot-check bbox_iou on one pair of boxes; here the first box lies entirely
# inside the second, so the IoU is the ratio of their areas.
bbox_iou([
      1247.1463623046875,
      1236.151123046875,
      1303.7264404296875,
      1295.27734375
    ], [
      1237.7386474609375,
      1230.549072265625,
      1316.587890625,
      1305.21875
    ])

0.5682008209663667

# PDF reconstruction

In [94]:
def draw_arabic_text(c, text, bbox, page_size, font_name='NotoNaskh', font_size=12):
    """Draw one OCR text snippet on the canvas at its bounding-box position.

    The text is passed through ``arabic_reshaper.reshape`` and then
    ``get_display`` before being drawn. The string is anchored at the bbox's
    ``x0`` horizontally and at its ``y1`` vertically, with the vertical
    coordinate flipped (``height - y1``) and both coordinates rescaled from
    the source page image to the ``letter`` page dimensions.

    Args:
        c: Canvas-like object providing ``setFont`` and ``drawString``.
        text: The string to draw.
        bbox: ``[x0, y0, x1, y1]`` in the coordinates of the source page
            image; only ``x0`` and ``y1`` are used.
        page_size: ``(width, height)`` of the source page image.
        font_name: Name of a registered font to draw with. Defaults to
            ``'NotoNaskh'``, the value that used to be hard-coded.
        font_size: Font size passed to ``setFont``. Defaults to ``12``, the
            value that used to be hard-coded.
    """
    reshaped_text = arabic_reshaper.reshape(text)
    display_text = get_display(reshaped_text)
    x0, _, _, y1 = bbox
    width, height = page_size

    # Convert coordinates from image space to PDF space
    pdf_x = x0 / width * letter[0]
    pdf_y = (height - y1) / height * letter[1]

    c.setFont(font_name, font_size)
    c.drawString(pdf_x, pdf_y, display_text)

In [101]:
# Destination file of the reconstructed PDF; handed to canvas.Canvas when the
# document is created.
OUTPUT_PATH = "../../data/constitution_pdfs/urdu_reconstructed/constitution_urdu_reconstructed.pdf"

In [102]:
# Register the Arabic-script TrueType fonts under the short names used when
# drawing text
for font_name, font_file in (
    ('NotoNaskh', '../../data/fonts/NotoNaskhArabic-Regular.ttf'),
    ('NotoNastaliq', '../../data/fonts/NotoNastaliqUrdu-Regular.ttf'),
):
    pdfmetrics.registerFont(TTFont(font_name, font_file))

# Open the output document with letter-sized pages
c = canvas.Canvas(OUTPUT_PATH, pagesize=letter)

# Initial font; draw_arabic_text sets its own font on every call
c.setFont('NotoNaskh', 20)

In [103]:
# Group the de-duplicated detections by page number
page_wise_splits = {}

for det in processed_detections:
    # setdefault creates the page's list on first sight; unlike a bare
    # `except:`, it cannot hide unrelated errors.
    page_wise_splits.setdefault(det['page_no'], []).append(det)

In [104]:
# Output pages are the letter size enlarged by PADDING in each dimension
PADDING = 20
padded_page_size = tuple(side + PADDING for side in letter)

In [105]:
# Detections that raised while being drawn, as (page_no, detection, exception)
failed_detections = []

for page_no in tqdm(range(len(page_sizes))):
    c.setPageSize(padded_page_size)

    # Pages without any detection are absent from page_wise_splits; they are
    # still emitted, just blank.
    for det in page_wise_splits.get(page_no, []):
        try:
            draw_arabic_text(c, det['text'], det['bbox'], page_sizes[page_no])
        except Exception as exc:
            # Best effort: skip only the failing detection and keep a record,
            # instead of silently abandoning the rest of the page.
            failed_detections.append((page_no, det, exc))

    c.showPage()

if failed_detections:
    print(f"{len(failed_detections)} detection(s) could not be drawn; inspect failed_detections")

  0%|          | 0/257 [00:00<?, ?it/s]

In [106]:
# Finish the PDF: save the canvas that was created with OUTPUT_PATH
c.save()