In [17]:
import os
import shutil
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import warnings
from PIL import Image
import io
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress PyPDF2 warnings
logging.getLogger('PyPDF2').setLevel(logging.ERROR)

# Directories
source_dir = "Downloaded_PDFs"
destination_dir = r"not_needed_tender_files\contains_images"

# Minimum size for a "large" image (in pixels)
MIN_LARGE_IMAGE_SIZE = 1000 * 1000  # 1 million pixels (e.g., 1000x1000)

def clean_filenames(directory):
    for filename in os.listdir(directory):
        new_filename = filename.strip()
        if new_filename != filename:
            os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))
            logger.info(f"Renamed: '{filename}' to '{new_filename}'")

# Create destination folder if it doesn't exist
if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)

def is_image_only_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path, first_page=1, last_page=3)
        return len(images) > 0
    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {e}")
        return False

def contains_large_image(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            for image in page.images:
                img = Image.open(io.BytesIO(image.data))
                if img.width * img.height >= MIN_LARGE_IMAGE_SIZE:
                    return True
        return False
    except Exception as e:
        logger.error(f"Error checking for large images in {pdf_path}: {e}")
        return False

def move_pdf_if_not_needed(pdf_path):
    try:
        if not os.path.exists(pdf_path):
            logger.warning(f"File not found: {pdf_path}")
            return

        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
    
        # Condition 1: Move PDFs with fewer than 4 pages
        if num_pages <= 4:
            logger.info(f"Moving {pdf_path} because it has fewer than 4 pages.")
            shutil.move(pdf_path, destination_dir)
            return
    
        # Check for text content
        has_text = any(page.extract_text().strip() for page in reader.pages)
    
        # Condition 2: If no text is found, check if it's an image-only PDF
        if not has_text and is_image_only_pdf(pdf_path):
            logger.info(f"Moving {pdf_path} because it contains no text and is image-based.")
            shutil.move(pdf_path, destination_dir)
            return
    
        # Condition 3: Check for large embedded images
        if contains_large_image(pdf_path):
            logger.info(f"Moving {pdf_path} because it contains a large embedded image.")
            shutil.move(pdf_path, destination_dir)
            return

        logger.info(f"Skipping {pdf_path} as it doesn't meet any moving criteria.")

    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {e}")

def process_pdfs(source_dir):
    total_files = sum(1 for f in os.listdir(source_dir) if f.lower().endswith('.pdf'))
    processed_files = 0

    clean_filenames(source_dir)

    for filename in os.listdir(source_dir):
        if filename.lower().endswith(".pdf"):
            processed_files += 1
            pdf_path = os.path.join(source_dir, filename)
            logger.info(f"Processing file {processed_files}/{total_files}: {filename}")
            move_pdf_if_not_needed(pdf_path)

    logger.info(f"Task completed. Processed {processed_files} PDF files.")

if __name__ == "__main__":
    process_pdfs(source_dir)

2024-10-11 12:17:54,997 - INFO - Processing file 1/1786: file10.pdf
2024-10-11 12:17:55,028 - INFO - Skipping Downloaded_PDFs\file10.pdf as it doesn't meet any moving criteria.
2024-10-11 12:17:55,030 - INFO - Processing file 2/1786: file100.pdf
2024-10-11 12:17:55,080 - INFO - Skipping Downloaded_PDFs\file100.pdf as it doesn't meet any moving criteria.
2024-10-11 12:17:55,080 - INFO - Processing file 3/1786: file1000.pdf
2024-10-11 12:17:55,135 - INFO - Skipping Downloaded_PDFs\file1000.pdf as it doesn't meet any moving criteria.
2024-10-11 12:17:55,135 - INFO - Processing file 4/1786: file1001.pdf
2024-10-11 12:17:55,289 - ERROR - Error checking for large images in Downloaded_PDFs\file1001.pdf: not enough image data
2024-10-11 12:17:55,290 - INFO - Skipping Downloaded_PDFs\file1001.pdf as it doesn't meet any moving criteria.
2024-10-11 12:17:55,290 - INFO - Processing file 5/1786: file1002.pdf
2024-10-11 12:17:55,351 - INFO - Skipping Downloaded_PDFs\file1002.pdf as it doesn't meet a