In [11]:
import os
import pymupdf  # pymupdf
import pytesseract
from PIL import Image
from io import BytesIO
import time
import logging
from google.api_core.exceptions import InternalServerError
from dotenv import load_dotenv

In [7]:
load_dotenv()

# Get the GOOGLE_API_KEY from the .env file
API_KEY = os.getenv('GOOGLE_API_KEY')

# Configure genai with the API key
genai.configure(api_key=API_KEY)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [15]:
def ocr_page(page_pix, lang='ben'):
    img = Image.open(BytesIO(page_pix.tobytes("png")))
    text = pytesseract.image_to_string(img, lang=lang)
    return text


def extract_text_from_pdf(pdf_path, lang='ben'):
    doc = pymupdf.open(pdf_path)
    full_text = ""

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)  # load page
        text = page.get_text("text")  # extract text

        if not text.strip():  # if no text is found
            # Convert PDF page to image for OCR without saving
            pix = page.get_pixmap()
            text = ocr_page(pix, lang=lang)

        full_text += text + "\n"

    return full_text.strip()

In [16]:
def process_pdf_folder(input_folder, output_folder, language='ben', retries=3):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(input_folder, filename)
            print(f"Processing PDF: {pdf_path}...")

            text = extract_text_from_pdf(pdf_path, lang=language)

            output_filename = os.path.splitext(filename)[0] + '.txt'
            output_path = os.path.join(output_folder, output_filename)

            # Save the extracted text to a file
            with open(output_path, 'w', encoding='utf-8') as file_out:
                file_out.write(text)

            print(f"Saved text to {output_path}")

            # Optional: Use the generative model to process the text
            try:
                model = genai.GenerativeModel(model_name="gemini-1.5-pro")
                response = model.generate_content([text, "Summarize the extracted text."])
                summary_output = os.path.splitext(output_filename)[0] + '_summary.txt'
                summary_output_path = os.path.join(output_folder, summary_output)
                
                # Save the summary to a file
                with open(summary_output_path, 'w', encoding='utf-8') as summary_out:
                    summary_out.write(response.text)
                print(f"Saved summary to {summary_output_path}")
            except InternalServerError as e:
                logging.warning(f"Error processing file {filename}: {e}")
                continue


In [17]:
input_folder = r'D:\SEM 5\RAG\sample_pdfs\bn'
output_folder = r'D:\SEM 5\RAG\sample_pdfs\output'

process_pdf_folder(input_folder, output_folder, language='ben')

Processing PDF: D:\SEM 5\RAG\sample_pdfs\bn\15092024_142.pdf...
Saved text to D:\SEM 5\RAG\sample_pdfs\output\15092024_142.txt
Saved summary to D:\SEM 5\RAG\sample_pdfs\output\15092024_142_summary.txt
Processing PDF: D:\SEM 5\RAG\sample_pdfs\bn\471 (TO).pdf...
Saved text to D:\SEM 5\RAG\sample_pdfs\output\471 (TO).txt




Processing PDF: D:\SEM 5\RAG\sample_pdfs\bn\AP Ramjan.pdf...
Saved text to D:\SEM 5\RAG\sample_pdfs\output\AP Ramjan.txt
Saved summary to D:\SEM 5\RAG\sample_pdfs\output\AP Ramjan_summary.txt
Processing PDF: D:\SEM 5\RAG\sample_pdfs\bn\NEC-14.pdf...
Saved text to D:\SEM 5\RAG\sample_pdfs\output\NEC-14.txt
Saved summary to D:\SEM 5\RAG\sample_pdfs\output\NEC-14_summary.txt
Processing PDF: D:\SEM 5\RAG\sample_pdfs\bn\Research Nirdeshika.pdf...
Saved text to D:\SEM 5\RAG\sample_pdfs\output\Research Nirdeshika.txt


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).