# In [None]:
import os
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path

# Point pytesseract at the Tesseract executable. This is a machine-specific
# Windows install path and must be adjusted when running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r"D:\Program Files\Tesseract OCR\tesseract.exe"

def _extract_text_directly(pdf_path: str) -> str:
    """Return the embedded text layer of *pdf_path*, one page per line block."""
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None/"" for pages without a text layer
        # (e.g. scanned pages); treat those as empty instead of raising.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)


def _ocr_page(page_image, filename: str, image_path: str) -> str:
    """OCR one page image, trying Tesseract PSM modes 6-13 until text appears."""
    for psm in range(6, 14):  # Try PSM modes 6 to 13
        try:
            page_text = pytesseract.image_to_string(
                page_image, config=f"--psm {psm}"
            )
        except Exception as e:
            # A single PSM mode failing is not fatal; move on to the next one.
            print(f"Error with PSM {psm} for {filename}: {e}")
            continue
        if page_text.strip():
            print(f"PSM {psm} succeeded for {filename}")
            return page_text

    # No PSM mode produced any text for this page.
    print(f"OCR failed for {filename}, image saved to {image_path}")
    return ""


def _ocr_pdf(pdf_path: str, img_folder: str, filename: str) -> str:
    """Rasterize *pdf_path*, save each page as PNG, and return the OCR text."""
    os.makedirs(img_folder, exist_ok=True)
    # NOTE(review): all pages are rendered in memory before OCR; for very
    # large PDFs consider converting in page ranges.
    images = convert_from_path(pdf_path, fmt="png")

    page_texts = []
    for i, page_image in enumerate(images):
        # Save each image with a sequential number so that the path reported
        # on OCR failure actually exists and can be inspected.
        image_path = os.path.join(img_folder, f"page_{i+1}.png")
        page_image.save(image_path)
        page_texts.append(_ocr_page(page_image, filename, image_path))

    # Accumulate every page instead of keeping only the last one.
    return "\n".join(page_texts)


def pdf_to_txt(pdf_folder: str, txt_folder: str) -> None:
    """
    Convert all PDF files in a folder to TXT files, using OCR if necessary.

    For every ``*.pdf`` file (extension matched case-insensitively) directly
    inside ``pdf_folder``, the embedded text layer is extracted first. When
    that extraction raises or yields only whitespace (typical for scanned
    documents), the pages are rendered to PNG images under
    ``<txt_folder>/<name>_images`` and OCR'd page by page. The result is
    written to ``<txt_folder>/<name>.txt`` as UTF-8.

    Args:
        pdf_folder: Directory containing the input PDF files.
        txt_folder: Directory for the output files; created if missing.

    Raises:
        OSError: If ``pdf_folder`` cannot be listed or an output cannot be
            written.
        Exception: Errors from rasterizing a PDF for OCR are not swallowed
            and abort the run.
    """
    os.makedirs(txt_folder, exist_ok=True)

    for filename in os.listdir(pdf_folder):
        stem, ext = os.path.splitext(filename)
        if ext.lower() != ".pdf":
            continue

        pdf_path = os.path.join(pdf_folder, filename)
        txt_path = os.path.join(txt_folder, stem + ".txt")
        img_folder = os.path.join(txt_folder, stem + "_images")

        try:
            # Attempt to extract text directly
            text = _extract_text_directly(pdf_path)
        except Exception as e:
            print(f"Direct extraction failed for {filename}: {e}")
            text = ""

        # Scanned PDFs usually extract "successfully" but with no text, so
        # fall back to OCR on empty output as well as on errors.
        if not text.strip():
            text = _ocr_pdf(pdf_path, img_folder, filename)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)

if __name__ == "__main__":
    # Input directory holding the PDFs and output directory for the TXT files.
    pdf_folder, txt_folder = r"D:\New Folder", r"D:\New Folder 2"

    # Run the batch conversion, then report completion.
    pdf_to_txt(pdf_folder=pdf_folder, txt_folder=txt_folder)
    print("Conversion complete!")

# Output: Conversion complete!
