In [7]:
import os
from pdf2image import convert_from_path
from pytesseract import image_to_string
from pathlib import Path
import pytesseract


In [8]:
# Locate the Tesseract executable for pytesseract.
# Resolution order:
#   1. the TESSERACT_CMD environment variable, if set (lets each machine override the path);
#   2. the default Windows install location below, but only if that file actually exists;
#   3. otherwise leave pytesseract's own setting untouched instead of pointing it at a
#      path that is not there.
_default_tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if _tesseract_cmd is None and os.path.isfile(_default_tesseract_cmd):
    _tesseract_cmd = _default_tesseract_cmd
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

In [10]:
# Function to OCR every page of a PDF with Tesseract
def extract_text_from_pdf(pdf_path, languages='urd+ben+eng+chi_sim'):
    """Return the OCR text of all pages of ``pdf_path`` as one string.

    Args:
        pdf_path: Path of the PDF; handed to ``convert_from_path`` to get
            one image per page.
        languages: Forwarded as the ``lang`` argument of
            ``pytesseract.image_to_string`` for every page image.

    Returns:
        The per-page OCR results concatenated with no separator, in the
        order the page images were returned. Empty string if there are
        no page images.
    """
    pages = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page, lang=languages) for page in pages
    )


In [11]:
# Function to process PDFs in multiple language folders and extract their text
def extract_text_from_multiple_folders(root_directory, language_folders=('en', 'ur', 'bn', 'zh')):
    """OCR every PDF found directly inside each language sub-folder.

    Args:
        root_directory: Directory that contains one sub-folder per language.
        language_folders: Names of the sub-folders to scan. Defaults to the
            four folders this notebook was written for, so existing callers
            that pass only ``root_directory`` behave as before.

    Returns:
        dict mapping each folder name that exists as a directory to a list
        of ``{"filename": ..., "text": ...}`` dicts, one per PDF, in sorted
        filename order. Folders that do not exist are left out of the dict;
        a folder with no PDFs maps to an empty list.
    """
    extracted_data = {}

    for language in language_folders:
        folder_path = os.path.join(root_directory, language)
        # isdir, not exists: a stray *file* named like a language folder
        # would otherwise reach os.listdir and raise.
        if not os.path.isdir(folder_path):
            continue

        language_data = []
        # Sorted so the result order does not depend on the arbitrary order
        # in which the file system lists directory entries.
        for filename in sorted(os.listdir(folder_path)):
            # Case-insensitive match so e.g. "REPORT.PDF" is not silently skipped.
            if not filename.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(folder_path, filename)
            print(f"Processing {filename} in {language} folder...")

            # Keep the filename next to its text so results stay traceable.
            language_data.append(
                {"filename": filename, "text": extract_text_from_pdf(pdf_path)}
            )

        extracted_data[language] = language_data

    return extracted_data

In [12]:
# Example: OCR every PDF under the language sub-folders of `sample_pdfs`
# (not a single file). `extracted_text` holds the dict returned by
# extract_text_from_multiple_folders and is reused by later cells.
root_directory = 'sample_pdfs'  
extracted_text = extract_text_from_multiple_folders(root_directory)

Processing Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.pdf in en folder...
Processing Reboot_Leadership_and_the_Art_of.pdf in en folder...
Processing The Alchemist by Paulo Coelho-1.pdf in en folder...
Processing 12-Rabiul-Awal-2024.pdf in ur folder...
Processing Extension-of-Ahdoc-Employees.pdf in ur folder...
Processing fasana-e-ajaib final.pdf in ur folder...
Processing Notification-for-Other-Nationals.pdf in ur folder...
Processing shora e rampur.pdf in ur folder...
Processing Solidarity-Day.pdf in ur folder...
Processing حیات جاوید، سوانح سر سید احمد خاں.pdf in ur folder...
Processing 15092024_142.pdf in bn folder...
Processing 471 (TO).pdf in bn folder...
Processing AP Ramjan.pdf in bn folder...
Processing NEC-14.pdf in bn folder...
Processing Research Nirdeshika.pdf in bn folder...
Processing আহম্মেদNOC.pdf in bn folder...
Processing 1553a07b-9f53-4e8b-9987-ae714000b95b.pdf in zh folder...
Processing 98aab034-f8d7-4f6e-9a0c-b52c12f55ce7.pdf in zh folder...


In [13]:
import json

# Persist the OCR results as JSON. ensure_ascii=False writes the non-Latin text
# as real characters rather than \uXXXX escapes, which is why the file is
# opened with an explicit UTF-8 encoding.
output_path = 'extracted_text_data.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(extracted_text, json_file, ensure_ascii=False, indent=4)

# Report the actual destination instead of a truncated message.
print(f"Extracted data saved to {output_path}")


Extracted data saved to...
