In [None]:
!pip install -U pypdfium2
!pip install pytesseract

In [None]:
import pypdfium2 as pdfium
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
from io import BytesIO
import matplotlib.pyplot as plt

In [None]:
# Tell pytesseract where the Tesseract executable lives. The path is a
# hard-coded Windows install location; adjust it for other machines.
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"

def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG-encoded bytes.

    Args:
        file_path: Path of the PDF document to open with pypdfium2.
        scale: Render scale factor handed to pypdfium2 for each page.
            Defaults to 300/72 (presumably ~300 DPI, given that PDF units
            are 1/72 inch -- confirm if exact resolution matters).

    Returns:
        A list with one single-entry dict per page, in page order. Each
        dict maps the zero-based page index to that page's JPEG bytes.
    """
    pdf_file = pdfium.PdfDocument(file_path)
    list_final_images = []

    try:
        for page_index in range(len(pdf_file)):
            page = pdf_file[page_index]
            try:
                # Render one page at a time through the per-page API instead
                # of the batch PdfDocument.render() helper.
                image = page.render(scale=scale).to_pil()

                # JPEG cannot store an alpha channel; normalise any other
                # mode to RGB so the save below cannot fail on it.
                if image.mode not in ("RGB", "L", "CMYK"):
                    image = image.convert("RGB")

                image_buffer = BytesIO()
                image.save(image_buffer, format='jpeg', optimize=True)
            finally:
                # Release the page before the document is closed.
                page.close()

            list_final_images.append({page_index: image_buffer.getvalue()})
    finally:
        # Always release the document handle, even if rendering fails.
        pdf_file.close()

    return list_final_images

def display_images(list_dict_final_images):
    """Show each page image in its own matplotlib figure.

    Args:
        list_dict_final_images: List of single-entry dicts whose value is
            the encoded image bytes of a page (the structure returned by
            ``convert_pdf_to_images``). Only the first value of each dict
            is used.

    Pages are titled by their position in the list, starting at 1.
    """
    for page_number, data in enumerate(list_dict_final_images, start=1):
        image_bytes = next(iter(data.values()))
        image = Image.open(BytesIO(image_bytes))

        # Size the figure from the pixel dimensions (100 px per inch unit).
        figure = plt.figure(figsize=(image.width / 100, image.height / 100))
        plt.title(f"----- Page Number {page_number} -----")
        plt.imshow(image)
        plt.axis("off")
        plt.show()

        # Explicitly drop the figure so that long documents do not pile up
        # open figures when show() does not dispose of them itself.
        plt.close(figure)

def preprocess_image(image):
    """Prepare a page image for OCR.

    Applies, in order: a median filter with Pillow's default settings, a
    contrast enhancement with factor 2, and a conversion to mode ``'1'``
    (1-bit pixels).

    Args:
        image: A PIL image.

    Returns:
        The processed PIL image in mode ``'1'``.
    """
    denoised = image.filter(ImageFilter.MedianFilter())
    high_contrast = ImageEnhance.Contrast(denoised).enhance(2)
    # NOTE(review): Pillow's mode '1' conversion dithers by default as far as
    # I know -- confirm that is what is wanted for OCR input.
    return high_contrast.convert('1')

def extract_text_with_pytesseract(list_dict_final_images, output_file_path, lang='ben'):
    """OCR every page image and write the combined text to a file.

    Args:
        list_dict_final_images: List of single-entry dicts whose value is
            the encoded image bytes of a page (the structure returned by
            ``convert_pdf_to_images``). Only the first value of each dict
            is used.
        output_file_path: Destination text file; written as UTF-8 and
            overwritten if it already exists.
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to ``'ben'``, the value previously hard-coded here.

    Each page is run through ``preprocess_image`` before OCR. The per-page
    texts are joined with a newline between them.
    """
    page_texts = []

    for data in list_dict_final_images:
        image_bytes = next(iter(data.values()))
        image = preprocess_image(Image.open(BytesIO(image_bytes)))
        page_texts.append(pytesseract.image_to_string(image, lang=lang))

    extracted_text = "\n".join(page_texts)
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    print(f"Text successfully saved to {output_file_path}")

# Input PDF and output text file locations (hard-coded local paths).
pdf_file_path = 'D:/code/document.pdf'
output_txt_file = 'D:/code/extracted_text.txt'

# Render all pages of the PDF to JPEG bytes, then preview them.
convert_pdf_to_images_result = convert_pdf_to_images(pdf_file_path)
display_images(convert_pdf_to_images_result)

# OCR the rendered pages and save the combined text to the output file.
extract_text_with_pytesseract(convert_pdf_to_images_result, output_txt_file)
