In [None]:
#!pip install pytesseract

In [43]:
import os
import base64
from openai import OpenAI
import fitz
import re
import pandas as pd
from img2table.document import PDF
from img2table.ocr import TesseractOCR
import json
import base64
import torch
from transformers import AutoProcessor, VisionEncoderDecoderModel
from PIL import Image
import fitz
import os
from pathlib import Path
import pytesseract
from logger import Logger
import warnings
warnings.filterwarnings('ignore')
from generic import GenericFunction
import ssl
import certifi
# CA bundle exported through the environment for curl-style HTTP clients.
# NOTE(review): presumably consumed by the Hugging Face model downloads; the
# path is relative, so it depends on the current working directory -- confirm.
os.environ['CURL_CA_BUNDLE'] = './certificates/huggingface.co.pem'


def _certifi_https_context(*args, **kwargs):
    """Build a default HTTPS context that trusts the certifi CA bundle.

    ``ssl._create_default_https_context`` is *called* by the standard library
    HTTPS client to obtain a context, so it has to be a factory. Assigning an
    already-built ``SSLContext`` instance makes that call fail with
    ``TypeError: 'SSLContext' object is not callable``.
    """
    kwargs.setdefault("cafile", certifi.where())
    return ssl.create_default_context(*args, **kwargs)


ssl._create_default_https_context = _certifi_https_context

# Point pytesseract at the Homebrew binary when it is present; otherwise keep
# pytesseract's own default so machines with tesseract elsewhere still work.
_HOMEBREW_TESSERACT = r'/opt/homebrew/bin/tesseract'
if os.path.exists(_HOMEBREW_TESSERACT):
    pytesseract.pytesseract.tesseract_cmd = _HOMEBREW_TESSERACT

class PDFProcessor:
    """Turn PDFs into per-page records of text, tables and image descriptions.

    For every page the pipeline combines:
      * text decoded by the Nougat model from a rasterized page image,
      * text read by Tesseract from a high-resolution page image,
      * tables exported by img2table into an Excel workbook,
      * an OpenAI description of the page when it embeds images.
    """

    # Matches a line that ends a sentence; used by clean_text.
    _SENTENCE_END = re.compile(r"[.!?]$")
    # Seconds to wait on the network before a PDF download is abandoned.
    _DOWNLOAD_TIMEOUT_SECONDS = 60

    def __init__(self, file_mapping, openai_key, source_pdf_path="source_data", output_folder="data_parsed", images_folder="images", page_range=None):
        """
        Initializes the PDFProcessor class.

        Args:
            file_mapping: Iterable of dicts mapping a download URL to the
                local PDF file name.
            openai_key: API key used to build the OpenAI client.
            source_pdf_path: Folder holding the PDFs and their ``_OCR.pdf``
                counterparts.
            output_folder: Stored on the instance; not read by this class.
            images_folder: Root folder for rasterized page images.
            page_range: Optional ``(start, stop)`` pair of zero-based page
                indexes (stop exclusive); ``None`` processes every page.

        Loads the ``facebook/nougat-base`` processor and model and reads the
        prompt from ``./prompts/pdf_parsing.txt``.
        """
        # NOTE(review): this stores the GenericFunction class itself, not an
        # instance; nothing in this class reads it -- confirm it is intended.
        self.generic = GenericFunction
        self.logger = Logger()
        self.file_mapping = file_mapping
        self.source_pdf_path = source_pdf_path
        self.output_folder = output_folder
        self.images_folder = images_folder
        self.page_range = page_range
        self.processor = AutoProcessor.from_pretrained("facebook/nougat-base")
        self.model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.api_key = openai_key
        self.client = OpenAI(api_key=self.api_key)
        self.prompt = self.read_prompt("./prompts/pdf_parsing.txt")
        self.logger.log_info("The prompt is {}".format(self.prompt))

    def read_prompt(self, file_path):
        """Return the contents of ``file_path`` with surrounding whitespace removed."""
        with open(file_path, 'r') as file:
            return file.read().strip()

    def get_or_download_pdfs(self):
        """
        Checks if all PDFs and their OCR versions exist locally. Downloads PDFs if not available.

        Returns:
            ``(original_files, ocr_files)``: two lists of local paths. OCR
            versions are never created here, only detected, so ``ocr_files``
            can be shorter than ``original_files``; callers must pair the two
            by file name rather than by position.

        Raises:
            Exception: If a download answers with a non-200 status code.
        """
        os.makedirs(self.source_pdf_path, exist_ok=True)
        original_files = []
        ocr_files = []

        for mapping in self.file_mapping:
            for url, file_name in mapping.items():
                local_path = os.path.join(self.source_pdf_path, file_name)
                ocr_path = os.path.join(self.source_pdf_path, file_name.replace(".pdf", "_OCR.pdf"))

                if os.path.exists(ocr_path):
                    self.logger.log_info(f"OCR version of the PDF exists locally at: {ocr_path}")
                    ocr_files.append(ocr_path)
                else:
                    self.logger.log_info(f"OCR version not found for: {file_name}. Ensure OCR processing is done.")

                if os.path.exists(local_path):
                    self.logger.log_info(f"Original PDF already exists locally at: {local_path}")
                    original_files.append(local_path)
                    continue

                # Imported lazily: the module's import block never brings
                # `requests` into scope, and the fully-cached path should not
                # depend on it.
                import requests

                self.logger.log_info(f"Downloading PDF from URL: {url}")
                response = requests.get(url, timeout=self._DOWNLOAD_TIMEOUT_SECONDS)
                if response.status_code != 200:
                    self.logger.log_error(f"Failed to download PDF from {url}")
                    raise Exception(f"Failed to download PDF. HTTP Status Code: {response.status_code}")
                with open(local_path, "wb") as f:
                    f.write(response.content)
                self.logger.log_info(f"Downloaded PDF to: {local_path}")
                original_files.append(local_path)

        return original_files, ocr_files

    def clean_text(self, text):
        """
        Cleans the input text by joining lines and handling non-string inputs.

        Non-string input is returned as ``str(text)`` (``""`` for ``None``).
        For strings, blank lines are dropped and consecutive lines that
        neither end in ``.``, ``!`` or ``?`` nor start with ``*`` are merged
        into one space-separated line. A sentence-ending or ``*`` line closes
        the pending merged line and is emitted on a line of its own.
        """
        if not isinstance(text, str):
            return "" if text is None else str(text)

        cleaned_lines = []
        pending = []  # fragments of the line currently being merged
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if self._SENTENCE_END.search(line) or line.startswith("*"):
                if pending:
                    cleaned_lines.append(" ".join(pending))
                    pending = []
                cleaned_lines.append(line)
            else:
                pending.append(line)
        if pending:
            cleaned_lines.append(" ".join(pending))
        return "\n".join(cleaned_lines)

    def dataframe_json_to_continuous_text(self, json_list):
        """
        Converts a list of JSON objects (representing DataFrame tables) into continuous text.

        Args:
            json_list: Tables, each either a JSON string or an already parsed
                list of row dicts.

        Returns:
            One line per table, ``"Table N: k is v, ... | k is v, ..."``,
            joined with newlines (``""`` for an empty list).

        Raises:
            ValueError: If an entry is not valid JSON or is not a list of dicts.
        """
        continuous_text = []
        for idx, table_json in enumerate(json_list, start=1):
            if isinstance(table_json, str):
                try:
                    table_json = json.loads(table_json)
                except json.JSONDecodeError as exc:
                    raise ValueError(f"Invalid JSON string at index {idx}: {table_json}") from exc
            if not isinstance(table_json, list) or not all(isinstance(row, dict) for row in table_json):
                raise ValueError(f"Invalid table format at index {idx}: Expected a list of dictionaries.")
            row_texts = [
                ", ".join(f"{key} is {value}" for key, value in row.items())
                for row in table_json
            ]
            continuous_text.append(f"Table {idx}: " + " | ".join(row_texts))
        return "\n".join(continuous_text)

    def encode_image(self, image_path):
        """Return the file at ``image_path`` as a base64-encoded ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def _image_data_url(self, image_path):
        """Build a ``data:`` URL for an image, with the MIME type taken from its extension.

        Falls back to ``image/jpeg`` when the extension is unknown or is not
        an image type.
        """
        import mimetypes

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        return f"data:{mime_type};base64,{self.encode_image(image_path)}"

    def analyze_image_with_openai(self, image_path, openai_model="gpt-4o-mini"):
        """
        Analyzes the content of an image using OpenAI GPT model.

        Sends ``self.prompt`` together with the image (inlined as a data URL)
        and returns the text of the first completion choice.
        """
        response = self.client.chat.completions.create(
            model=openai_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": self.prompt},
                        # Page images are saved as PNG, so the MIME type is
                        # derived from the file instead of assuming JPEG.
                        {"type": "image_url", "image_url": {"url": self._image_data_url(image_path)}},
                    ],
                }
            ],
            max_tokens=500,
        )
        return response.choices[0].message.content

    def extract_images(self, pdf_path, output_folder="images", dpi=300):
        """
        Extracts full-page and sub-images from a PDF and organizes them in a structured folder.

        Layout: ``<output_folder>/<pdf name>/page-<n>/`` holds
        ``full_page_image.png`` and one file per embedded image.

        Returns:
            Dict keyed by ``"page-<n>"`` (1-based) with ``"full_page_image"``
            and ``"sub_images"`` entries, paths relative to the current
            working directory.
        """
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        pdf_folder = os.path.join(output_folder, pdf_name)
        os.makedirs(pdf_folder, exist_ok=True)
        image_details = {}
        # Context manager so the document handle is released even on errors.
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_label = f"page-{page_num + 1}"
                page_folder = os.path.join(pdf_folder, page_label)
                os.makedirs(page_folder, exist_ok=True)

                full_page_image_path = os.path.join(page_folder, "full_page_image.png")
                page.get_pixmap(dpi=dpi).save(full_page_image_path)

                sub_image_paths = []
                for img_index, img in enumerate(page.get_images(full=True), start=1):
                    base_image = doc.extract_image(img[0])  # img[0] is the xref
                    # Save the sub-image with a detailed name
                    sub_image_name = f"page_{page_num + 1}_sub_image_{img_index}.{base_image['ext']}"
                    sub_image_path = os.path.join(page_folder, sub_image_name)
                    with open(sub_image_path, "wb") as img_file:
                        img_file.write(base_image["image"])
                    sub_image_paths.append(os.path.relpath(sub_image_path, start=os.getcwd()))

                image_details[page_label] = {
                    "full_page_image": os.path.relpath(full_page_image_path, start=os.getcwd()),
                    "sub_images": sub_image_paths,
                }

        return image_details

    def extract_text_from_image(self, image_path):
        """Decode the text of a page image with the Nougat model loaded in ``__init__``."""
        # The file handle is only needed while the processor reads the pixels.
        with Image.open(image_path) as image:
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)
        outputs = self.model.generate(pixel_values, max_length=512, early_stopping=True)
        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
        return generated_text.strip()

    def extract_text_from_page(self, pdf_path, page_num):
        """Rasterize page ``page_num`` (zero-based) of ``pdf_path`` and return its Nougat text."""
        image_path = self.rasterize_page(pdf_path, page_num)
        return self.extract_text_from_image(image_path)

    def save_tables_to_excel(self, pdf_path, excel_path="tables.xlsx"):
        """
        Extract all tables from the PDF and save them into an Excel file.

        The parent folder of ``excel_path`` is created when missing.
        """
        parent_folder = os.path.dirname(excel_path)
        if parent_folder:
            os.makedirs(parent_folder, exist_ok=True)
        pdf = PDF(src=pdf_path)
        ocr = TesseractOCR(lang="eng")
        pdf.to_xlsx(excel_path, ocr=ocr)

    def extract_tables_from_excel(self, excel_path, page_num):
        """
        Extract and clean all tables for a specific page from the Excel file.

        A sheet belongs to the page when the part of its name before
        ``" - "`` equals ``"Page <page_num>"`` (``page_num`` is 1-based here).

        Returns:
            List of JSON strings (``orient="records"``), one per matching
            sheet, with every cell passed through ``clean_text``.
        """
        tables = []
        page_name = f"Page {page_num}"
        with pd.ExcelFile(excel_path) as xls:
            for sheet_name in xls.sheet_names:
                if sheet_name.split(" - ")[0].strip() != page_name:
                    continue
                df = pd.read_excel(xls, sheet_name=sheet_name)
                # DataFrame.map supersedes the deprecated applymap; fall back
                # to applymap on pandas versions that predate it.
                elementwise = df.map if hasattr(df, "map") else df.applymap
                df = elementwise(self.clean_text)
                tables.append(df.to_json(orient="records"))
        return tables

    def extract_page_title_from_header(self, page, y_threshold=100):
        """Derive a short page title from the text near the top of the page.

        Collects the spans of text blocks whose top edge lies above
        ``y_threshold``, strips everything except letters, digits, whitespace
        and ``&``, and returns the first two words (three when the second is
        ``and`` / ``&``). Falls back to ``"Page <n>"`` when no word is found.
        """
        top_texts = [
            span["text"]
            for block in page.get_text("dict")["blocks"]
            if "lines" in block and block["bbox"][1] < y_threshold
            for line in block["lines"]
            for span in line["spans"]
        ]
        full_title = " ".join(top_texts).strip()
        words = re.sub(r"[^a-zA-Z0-9\s&]", "", full_title).split()
        if not words:
            return f"Page {page.number + 1}"
        if len(words) == 1:
            return words[0]
        if len(words) > 2 and words[1].lower() in {"and", "&"}:
            return f"{words[0]} {words[1]} {words[2]}"
        return f"{words[0]} {words[1]}"

    def rasterize_page(self, pdf_path, page_num, dpi=96):
        """
        Converts a page to an image and saves it under the respective PDF folder.

        Args:
            pdf_path: PDF to read.
            page_num: Zero-based page index.
            dpi: Rendering resolution.

        Returns:
            Path of the PNG written to
            ``<images_folder>/<pdf name>/page-<n>/rasterized_full_page_image.png``.
        """
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        page_folder = os.path.join(self.images_folder, pdf_name, f"page-{page_num + 1}")
        os.makedirs(page_folder, exist_ok=True)

        # Generate the full-page image; this runs once per page, so the
        # document is closed again right after rendering.
        full_page_image_path = os.path.join(page_folder, "rasterized_full_page_image.png")
        with fitz.open(pdf_path) as pdf:
            pdf.load_page(page_num).get_pixmap(dpi=dpi).save(full_page_image_path)
        return full_page_image_path

    def get_text_tesseract(self, filename):
        """Return the cleaned Tesseract text of the image file ``filename``."""
        with Image.open(filename) as image:
            text = pytesseract.image_to_string(image)
        return self.clean_text(text)

    def _resolve_page_range(self, page_count):
        """Return the zero-based page indexes to process for a document.

        ``self.page_range`` is read as ``(start, stop)`` with an exclusive
        stop, clamped to ``[0, page_count]``; when unset every page is used.
        """
        if not self.page_range:
            return range(page_count)
        start, stop = self.page_range
        return range(max(start, 0), min(stop, page_count))

    def process_pdf(self, pdf_path, ocr_pdf, excel_path, image_pdf_path="images", ):
        """
        Processes a PDF and extracts text, tables, and images into a DataFrame.

        Args:
            pdf_path: Original PDF; source of titles, images and tables.
            ocr_pdf: PDF whose pages are rasterized and decoded with Nougat.
            excel_path: Workbook the tables are exported to and read from.
            image_pdf_path: Root folder for the extracted images.

        Returns:
            DataFrame with one row per processed page.
        """
        image_details = self.extract_images(pdf_path, output_folder=image_pdf_path)
        self.save_tables_to_excel(pdf_path, excel_path)

        page_data = []
        with fitz.open(pdf_path) as doc:
            # Honors the start of page_range as well as its stop.
            for page_num in self._resolve_page_range(len(doc)):
                self.logger.log_info("Ruuning for PDF : {} and page number {}".format(pdf_path,page_num))
                page = doc[page_num]
                page_title = self.extract_page_title_from_header(page)
                page_tables = self.extract_tables_from_excel(excel_path, page_num + 1)
                extracted_text = self.extract_text_from_page(ocr_pdf, page_num)

                page_image_details = image_details.get(f"page-{page_num + 1}", {})
                full_page_image = page_image_details.get("full_page_image", "")
                sub_images = page_image_details.get("sub_images", [])
                # Without a rendered page image there is nothing for Tesseract to read.
                extracted_text_tesseract = self.get_text_tesseract(full_page_image) if full_page_image else ""

                image_description = ""
                if sub_images:
                    # The whole page is described, not each embedded image.
                    self.logger.log_info("Getting image description for path {}".format(full_page_image))
                    image_description = self.analyze_image_with_openai(full_page_image)

                table_description = self.dataframe_json_to_continuous_text(page_tables)
                combined_text = f"{extracted_text_tesseract}\n\n"
                if image_description:
                    combined_text += f"Image Details:\n{image_description}\n\n"
                if table_description:
                    combined_text += f"Table Details:\n{table_description}"

                page_data.append({
                    "PDF Name": os.path.basename(pdf_path),
                    "Page Number": page_num + 1,
                    "Page Title": page_title,
                    "Combined Text": combined_text,
                    "Page Text": extracted_text,
                    "Page Text(Tesseract)": extracted_text_tesseract,
                    "All Image Paths": [full_page_image] + sub_images,
                    "Full Page Image Path": full_page_image,
                    "Sub Image Paths": sub_images,
                    "Image Description": image_description,
                    "Table JSON": page_tables,
                    "Table Description": table_description
                })

        return pd.DataFrame(page_data)



def main():
    """Process every configured PDF and write one consolidated CSV.

    Reads ``file_mappings`` and the OpenAI key from the project configuration,
    makes sure the PDFs are available locally, processes each PDF that has an
    OCR counterpart and saves all page rows to
    ``data_parsed/Combined_Output.csv``.
    """
    from generic import GenericFunction
    generic = GenericFunction()
    file_mapping = generic.get_value("file_mappings")
    open_ai_key = generic.get_value("api_keys")['openai']
    page_range = None
    processor = PDFProcessor(file_mapping, open_ai_key, page_range=page_range)
    original_files, ocr_files = processor.get_or_download_pdfs()

    # The workbook and CSV writers do not create their target folder.
    os.makedirs("data_parsed", exist_ok=True)

    # The two lists can differ in length (a missing OCR file is simply not
    # listed), so pairing by position could attach the wrong OCR file to a
    # PDF. Pair by the "<name>_OCR.pdf" naming convention instead.
    ocr_by_path = {os.path.normpath(path): path for path in ocr_files}

    combined_data = []
    for original_pdf in original_files:
        expected_ocr = os.path.normpath(os.path.join(
            os.path.dirname(original_pdf),
            os.path.basename(original_pdf).replace(".pdf", "_OCR.pdf"),
        ))
        ocr_pdf = ocr_by_path.get(expected_ocr)
        if ocr_pdf is None:
            print(f"Skipping {original_pdf}: OCR version not found at {expected_ocr}")
            continue
        excel_path = f"data_parsed/{os.path.basename(original_pdf).replace('.pdf', '_tables.xlsx')}"
        pdf_df = processor.process_pdf(original_pdf, ocr_pdf, excel_path, "./images")
        combined_data.append(pdf_df)

    # pd.concat raises ValueError on an empty list.
    if not combined_data:
        print("No PDFs were processed; nothing to save.")
        return

    final_combined_df = pd.concat(combined_data, ignore_index=True)
    output_path = "data_parsed/Combined_Output.csv"
    final_combined_df.to_csv(output_path, index=False)
    print(f"Processed all PDFs and saved consolidated output: {output_path}")

In [44]:
import os
# Entry point: run the whole pipeline only when this code is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

2024-11-24 16:32:12,874 [INFO] Loaded config.json from: /Users/deepakn/Desktop/PDF-RAG/config.json
Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    896,
    672
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "qkv_bias": true,
  "transformers_version": "4.47.0.dev0",
  "use_absolute_embeddings": false,
  "window_size": 7
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropo

tesseract 5.5.0
 leptonica-1.85.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.0.4) : libpng 1.6.44 : libtiff 4.7.0 : zlib 1.2.12 : libwebp 1.4.0 : libopenjp2 2.5.2
 Found NEON
 Found libarchive 3.7.7 zlib/1.2.12 liblzma/5.6.3 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.6
 Found libcurl/8.6.0 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.61.0


2024-11-24 16:33:12,045 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 0
2024-11-24 16:33:25,963 [INFO] Getting image description for path images/Astor Manual/page-1/full_page_image.png
2024-11-24 16:33:31,599 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-24 16:33:31,621 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 1
2024-11-24 16:33:45,845 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 2
2024-11-24 16:33:53,131 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 3
2024-11-24 16:34:06,693 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 4
2024-11-24 16:34:21,578 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 5
2024-11-24 16:34:37,191 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 6
2024-11-24 16:34:52,654 [INFO] Ruuning for PDF : source_data/Astor Manual.pdf and page number 7
2024-11-

tesseract 5.5.0
 leptonica-1.85.0
  libgif 5.2.2 : libjpeg 8d (libjpeg-turbo 3.0.4) : libpng 1.6.44 : libtiff 4.7.0 : zlib 1.2.12 : libwebp 1.4.0 : libopenjp2 2.5.2
 Found NEON
 Found libarchive 3.7.7 zlib/1.2.12 liblzma/5.6.3 bz2lib/1.0.8 liblz4/1.10.0 libzstd/1.5.6
 Found libcurl/8.6.0 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.61.0


2024-11-24 17:28:51,879 [INFO] Ruuning for PDF : source_data/APP-TIAGO-FINAL-OMSB.pdf and page number 0
2024-11-24 17:29:07,018 [INFO] Getting image description for path images/APP-TIAGO-FINAL-OMSB/page-1/full_page_image.png
2024-11-24 17:29:16,657 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-24 17:29:16,675 [INFO] Ruuning for PDF : source_data/APP-TIAGO-FINAL-OMSB.pdf and page number 1
2024-11-24 17:29:30,479 [INFO] Getting image description for path images/APP-TIAGO-FINAL-OMSB/page-2/full_page_image.png
2024-11-24 17:29:35,615 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-24 17:29:35,618 [INFO] Ruuning for PDF : source_data/APP-TIAGO-FINAL-OMSB.pdf and page number 2
2024-11-24 17:29:42,659 [INFO] Getting image description for path images/APP-TIAGO-FINAL-OMSB/page-3/full_page_image.png
2024-11-24 17:29:49,256 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 

Processed all PDFs and saved consolidated output: data_parsed/Combined_Output.csv
