In [1]:
#Dependencies
import numpy as np
import pandas as pd
from PIL import Image, ImageSequence
from pdf2image import convert_from_path, pdfinfo_from_path
import logging as logger
import os
import re
import pytesseract
import bs4

In [2]:
#Configs
raw_folder = "/Users/sali115/text_mapping/data/raw"
doc_name = "Exclusions - non-VIP 5.2.pdf"
file_name = os.path.join(raw_folder, doc_name)
processed_folder = "/Users/sali115/text_mapping/data/processed"

In [185]:
#Utils

def rename_images(image_path, folder_path):
    try:
        page_num = int(re.findall(r"(\d+)\.[a-z]{3}$", image_path)[0])
        new_path = os.path.join(folder_path, f"page{page_num}.png")
        os.rename(image_path, new_path)
        doc_name = folder_path.rstrip("/").split("/")[-1]
        logger.info(f"Saved Image Number {page_num} of {doc_name} at {new_path}")
    except Exception as e:
        logger.error(f"Error in renaming image: {e}")
        
def ocr_hocr(img_path, psm=4):
    return pytesseract.image_to_pdf_or_hocr(
        img_path, extension="hocr", config=f"--psm {psm}"
    )

def get_text_from_hocr(hocr):
    hocr_text = []
    soup = bs4.BeautifulSoup(hocr, "html.parser")
    for line in soup.select(".ocr_line"):
        line_text = re.sub(r"\s+", " ", line.text).strip()
        alphanum_text = re.sub("[^a-zA-Z0-9 \n]", "", line_text).strip()
        if alphanum_text != "":
            hocr_text.append(line_text)
    return "\n".join(hocr_text)

def get_ocr_df(page_num, hocr):
    page_dict = {
        "page": '$page_num',
        "line": '$line',
        "text": '$text',
    }
    line = 1
    text = get_text_from_hocr(hocr).split(".")
    text = [re.sub(r"\n\d+", "", txt).replace("\n", " ") for txt in text]
    text = list(filter(None, text))
    
    page_record = list()
    
    for txt in text:
        if txt.isdigit():
            continue
        line += 1
        page_dict["page"] = page_num
        page_dict["line"] = line
        page_dict["text"] = txt
        page_record.append(page_dict.copy())
    return pd.DataFrame.from_records(page_record)

In [186]:
# # def fltr(txt):
# #     return re.sub(r"\\n[\d+]", " ", txt)
# list(filter(lambda x: re.sub(r"\n\d+", " ", x),text))

In [187]:
# text

In [188]:
# text = get_text_from_hocr(hocr).split(".")
# text = [re.sub(r"\n\d+", "", txt) for txt in text]
# text = list(filter(None, text))
# text

In [189]:
# list(filter(None, text))

In [190]:
# get_text_from_hocr(hocr)

In [191]:
#File Splitting - Doc pre-processing

logger.info(f"Splitting pdf {doc_name} into PNGs")
output_folder = os.path.join(processed_folder, doc_name.split(".")[0])
os.makedirs(output_folder, exist_ok=True)
num_pages = pdfinfo_from_path(file_name)['Pages']
logger.info(
    f"Found {num_pages} pages of {doc_name} to split into PNGs")

image_paths = convert_from_path(file_name,
                                output_folder=output_folder,
                                paths_only=True,
                                fmt="png",
                                output_file="page")
for image_path in image_paths:
    rename_images(image_path, output_folder)

In [192]:
#Getting OCR Results
ocr_df = pd.DataFrame(columns= ["page","line","text", "CODE"])

for page_path in os.listdir(output_folder):
    page_num = int(re.findall(r'\d+', page_path)[0])
    page_img = os.path.join(output_folder, page_path)
    hocr = ocr_hocr(page_img).decode(encoding="utf-8", errors="ignore")
    ocr_df = pd.concat([ocr_df, get_ocr_df(page_num, hocr)])


In [194]:
ocr_df[ocr_df["page"]==2]

Unnamed: 0,page,line,text,CODE
0,2,2,Emirate of Abu Dhabi Non-VIP Exclusions,
1,2,3,"Healthcare Services, which are not medically ...",
2,2,4,"All expenses relating to dental treatment, de...",
3,2,5,Domiciliary care; private nursing care; care ...,
4,2,6,Custodial care includes a,
5,2,7,Non-medical treatment services; or b,
6,2,8,Health-related services which do not seek to ...,
7,2,9,Services which do not require continuous admi...,
8,2,10,Personal comfort and convenience items (telev...,
9,2,11,Healthcare Services and associated expenses f...,
