In [38]:
import pymupdf
import pytesseract
from pdfminer.high_level import extract_text
from PIL import Image, ImageOps, ImageFilter
import cv2  # For multilingual OCR
import os
import numpy as np
import fitz

In [39]:
# Point pytesseract at the Windows Tesseract install, but only when that binary
# actually exists. On any other machine the assignment is skipped so that
# pytesseract keeps whatever command it was already configured with instead of
# being overridden by a path that is not there.
TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

In [40]:
# Dump the text of every page of one sample PDF into output.txt (UTF-8 bytes),
# writing a form feed after each page so page boundaries stay recoverable.
# Both the PDF and the output file are opened through context managers so they
# are closed even if text extraction fails part-way through the document.
with pymupdf.open(r"D:\SEM 5\RAG\sample_pdfs\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.pdf") as doc, \
        open("output.txt", "wb") as out:
    for page in doc:
        out.write(page.get_text().encode("utf8"))
        out.write(b"\f")  # form feed (byte value 12) marks the end of a page

In [47]:
def extract_text_from_pdf(pdf_path, language = 'eng'):
    """Return the embedded text of a PDF, read page by page with PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        language: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        The text of all pages concatenated in page order. This is an empty
        string when no page yields any text. Returns None when the document
        cannot be opened or read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document even if a page fails to parse.
        with pymupdf.open(pdf_path) as doc:
            # Each page contributes its text exactly once.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"Error extracting text from {pdf_path}: {e}")
        return None
    


In [48]:
# Process the PDFs

def process_pdf_folder(input_folder, output_folder, language = 'eng'):
    """Convert every PDF in a folder to a UTF-8 .txt file.

    For each PDF the embedded text is extracted with extract_text_from_pdf.
    When that yields nothing (None, empty, or whitespace only) the file is
    passed to ocr_from_pdf instead. ocr_from_pdf is not defined in this cell
    and must already exist in the notebook namespace.

    Args:
        input_folder: Directory that is scanned (non-recursively) for PDFs.
        output_folder: Directory that receives one .txt per PDF; created if
            it does not exist.
        language: Language code forwarded to ocr_from_pdf.

    Returns:
        None. Progress is reported with print().
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and the log) is deterministic.
    for filename in sorted(os.listdir(input_folder)):
        # Case-insensitive so files named *.PDF are not silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(input_folder, filename)
        print(f"Found PDF: {pdf_path}...")

        text = extract_text_from_pdf(pdf_path)

        # Extraction returns "" (not None) for PDFs without a text layer, so
        # the fallback has to trigger on blank text as well as on None.
        if text is None or not text.strip():
            print(f'No text found in {filename}, Applying OCR using pytesseract...')
            ocr_text = ocr_from_pdf(pdf_path, language)
            if ocr_text is not None:
                text = ocr_text

        # Both extraction and OCR failed: writing None would raise TypeError.
        if text is None:
            print(f"Skipping {filename}: no text could be extracted.")
            continue

        output_filename = os.path.splitext(filename)[0] + '.txt'
        output_path = os.path.join(output_folder, output_filename)

        with open(output_path, 'w', encoding='utf-8') as file_out:
            file_out.write(text)

        print(f"Saved text to {output_path}")

In [49]:
# Convert the English sample PDFs to text files ('eng' is the language code
# handed to process_pdf_folder).

eng_pdfs = r"D:\SEM 5\RAG\sample_pdfs\en"
output_folder_en = r"D:\SEM 5\RAG\converted_files\en"

process_pdf_folder(
    input_folder=eng_pdfs,
    output_folder=output_folder_en,
    language="eng",
)

Found PDF: D:\SEM 5\RAG\sample_pdfs\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\Blue_Ocean_Strategy,_Expanded_Edition_How_to_Create_Uncontested-2.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\en\Reboot_Leadership_and_the_Art_of.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\Reboot_Leadership_and_the_Art_of.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\en\The Alchemist by Paulo Coelho-1.pdf...
Saved text to D:\SEM 5\RAG\converted_files\en\The Alchemist by Paulo Coelho-1.txt


In [50]:
# Convert the Bengali sample PDFs to text files ('ben' is the language code
# handed to process_pdf_folder).

bengali_pdfs = r"D:\SEM 5\RAG\sample_pdfs\bn"
output_folder_bn = r"D:\SEM 5\RAG\converted_files\bn"

process_pdf_folder(
    input_folder=bengali_pdfs,
    output_folder=output_folder_bn,
    language="ben",
)

Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\15092024_142.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\15092024_142.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\471 (TO).pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\471 (TO).txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\AP Ramjan.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\AP Ramjan.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\NEC-14.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\NEC-14.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\Research Nirdeshika.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\Research Nirdeshika.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\bn\আহম্মেদNOC.pdf...
Saved text to D:\SEM 5\RAG\converted_files\bn\আহম্মেদNOC.txt


In [51]:
# Convert the Urdu sample PDFs to text files ('urd' is the language code
# handed to process_pdf_folder).

urdu_pdfs = r"D:\SEM 5\RAG\sample_pdfs\ur"
output_folder_ur = r"D:\SEM 5\RAG\converted_files\ur"

process_pdf_folder(
    input_folder=urdu_pdfs,
    output_folder=output_folder_ur,
    language="urd",
)

Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\12-Rabiul-Awal-2024.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\12-Rabiul-Awal-2024.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Extension-of-Ahdoc-Employees.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Extension-of-Ahdoc-Employees.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\fasana-e-ajaib final.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\fasana-e-ajaib final.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Notification-for-Other-Nationals.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Notification-for-Other-Nationals.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\shora e rampur.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\shora e rampur.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\Solidarity-Day.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\Solidarity-Day.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\ur\حیات جاوید، سوانح سر سید احمد خاں.pdf...
Saved text to D:\SEM 5\RAG\converted_files\ur\حیات جاوید، سوانح سر سید احمد خا

In [52]:
# Convert the Chinese sample PDFs to text files ('chi_sim' is the language
# code handed to process_pdf_folder).

chinese_pdfs = r"D:\SEM 5\RAG\sample_pdfs\zh"
output_folder_zh = r"D:\SEM 5\RAG\converted_files\zh"

process_pdf_folder(
    input_folder=chinese_pdfs,
    output_folder=output_folder_zh,
    language="chi_sim",
)


Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\1553a07b-9f53-4e8b-9987-ae714000b95b.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\1553a07b-9f53-4e8b-9987-ae714000b95b.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\98aab034-f8d7-4f6e-9a0c-b52c12f55ce7.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\98aab034-f8d7-4f6e-9a0c-b52c12f55ce7.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\ec2def2f-cc7b-44f3-87d1-24dc82f3a0ca.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\ec2def2f-cc7b-44f3-87d1-24dc82f3a0ca.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230313555181904759.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230313555181904759.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230907694757200665.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230907694757200665.txt
Found PDF: D:\SEM 5\RAG\sample_pdfs\zh\P020230907695746624812.pdf...
Saved text to D:\SEM 5\RAG\converted_files\zh\P020230907695746624812.txt
