In [2]:
import os
import pdfplumber
import pytesseract
from PIL import Image

def _extract_page_text(page, ocr_resolution=300):
    """Return the text of one pdfplumber page, using OCR if it has no text.

    Args:
      page: A page object taken from ``pdfplumber.open(...).pages``.
      ocr_resolution: DPI used to render the page before running OCR.

    Returns:
      The embedded text when the page has any; otherwise the OCR result for
      the rendered page image.
    """
    # extract_text() may yield None or an empty string for image-only pages,
    # so normalise to a string before testing it.
    embedded_text = page.extract_text() or ""
    if embedded_text.strip():
        return embedded_text

    # No usable text layer: render the page with pdfplumber and hand the
    # resulting PIL image to Tesseract.
    rendered_page = page.to_image(resolution=ocr_resolution).original
    return pytesseract.image_to_string(rendered_page)


def pdf_to_txt(pdf_folder, txt_folder):
    """
    Converts all PDF files in a folder to TXT files, using OCR if necessary.

    Each page is first read through its embedded text layer; pages without
    one are rendered to an image and passed through OCR. The text of all
    pages is joined with newlines and written to ``<name>.txt`` in
    ``txt_folder``. A PDF that cannot be processed is reported on standard
    output and skipped, so one bad file does not stop the whole batch.

    Args:
      pdf_folder: The path to the folder containing PDF files.
      txt_folder: The path to the folder where TXT files will be saved.
        It is created (including parents) if it does not exist.
    """
    os.makedirs(txt_folder, exist_ok=True)

    for filename in os.listdir(pdf_folder):
        # Match the extension case-insensitively so "SCAN.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        txt_name = os.path.splitext(filename)[0] + ".txt"
        txt_path = os.path.join(txt_folder, txt_name)

        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_texts = [_extract_page_text(page) for page in pdf.pages]
        except Exception as error:  # per-file boundary: report and move on
            print(f"Skipping {pdf_path}: {type(error).__name__}: {error}")
            continue

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("\n".join(page_texts))

if __name__ == "__main__":
    # Ask for the source folder first, then the destination folder, and
    # run the batch conversion on that pair.
    pdf_folder, txt_folder = (
        input("Enter the path to the PDF folder: "),
        input("Enter the path to the TXT folder: "),
    )
    pdf_to_txt(pdf_folder=pdf_folder, txt_folder=txt_folder)
    print("Conversion complete!")

Conversion complete!
