In [1]:
# import os

# # Specify the directory path
# directory_path = 'Downloaded_PDFs'

# # Get all the PDF files in the directory
# pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]

# # Sort the files (optional, to maintain consistent order)
# pdf_files.sort()

# # Rename the files with increasing numbers
# for i, filename in enumerate(pdf_files, start=1):
#     new_filename = f"file{i}.pdf"
#     old_file_path = os.path.join(directory_path, filename)
#     new_file_path = os.path.join(directory_path, new_filename)
    
#     # Rename the file
#     os.rename(old_file_path, new_file_path)

# print("Files renamed successfully!")


Files renamed successfully!


In [10]:
import os
import shutil
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import warnings
from PIL import Image
import io
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Suppress PyPDF2 warnings
logging.getLogger('PyPDF2').setLevel(logging.ERROR)

# Directories
source_dir = "Downloaded_PDFs"
destination_dir = r"not_needed_tender_files\contains_images"

# Minimum size for a "large" image (in pixels)
MIN_LARGE_IMAGE_SIZE = 1000 * 1000  # 1 million pixels (e.g., 1000x1000)

def clean_filenames(directory):
    for filename in os.listdir(directory):
        new_filename = filename.strip()
        if new_filename != filename:
            os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))
            logger.info(f"Renamed: '{filename}' to '{new_filename}'")

# Create destination folder if it doesn't exist
if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)

def is_image_only_pdf(pdf_path):
    try:
        images = convert_from_path(pdf_path, first_page=1, last_page=3)
        return len(images) > 0
    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {e}")
        return False

def contains_large_image(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            for image in page.images:
                img = Image.open(io.BytesIO(image.data))
                if img.width * img.height >= MIN_LARGE_IMAGE_SIZE:
                    return True
        return False
    except Exception as e:
        logger.error(f"Error checking for large images in {pdf_path}: {e}")
        return False

def move_pdf_if_not_needed(pdf_path):
    try:
        if not os.path.exists(pdf_path):
            logger.warning(f"File not found: {pdf_path}")
            return

        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
    
        # Condition 1: Move PDFs with fewer than 4 pages
        if num_pages <= 4:
            logger.info(f"Moving {pdf_path} because it has fewer than 4 pages.")
            shutil.move(pdf_path, destination_dir)
            return
    
        # Check for text content
        has_text = False
        for page in reader.pages:
            text = page.extract_text()
            if text and text.strip():
                has_text = True
                break  # Stop as soon as we find text
    
        # Condition 2: If no text is found, move the PDF to the image folder
        if not has_text:
            logger.info(f"Moving {pdf_path} because it contains no text.")
            shutil.move(pdf_path, destination_dir)
            return
    
        # Condition 3: Check for large embedded images
        if contains_large_image(pdf_path):
            logger.info(f"Moving {pdf_path} because it contains a large embedded image.")
            shutil.move(pdf_path, destination_dir)
            return

        logger.info(f"Skipping {pdf_path} as it doesn't meet any moving criteria.")

    except Exception as e:
        logger.error(f"Error processing {pdf_path}: {e}")

def process_pdfs(source_dir):
    total_files = sum(1 for f in os.listdir(source_dir) if f.lower().endswith('.pdf'))
    processed_files = 0

    clean_filenames(source_dir)

    for filename in os.listdir(source_dir):
        if filename.lower().endswith(".pdf"):
            processed_files += 1
            pdf_path = os.path.join(source_dir, filename)
            logger.info(f"Processing file {processed_files}/{total_files}: {filename}")
            move_pdf_if_not_needed(pdf_path)

    logger.info(f"Task completed. Processed {processed_files} PDF files.")

if __name__ == "__main__":
    process_pdfs(source_dir)


2024-10-11 16:03:27,031 - INFO - Processing file 1/1434: file10.pdf
2024-10-11 16:03:27,136 - INFO - Skipping Downloaded_PDFs\file10.pdf as it doesn't meet any moving criteria.
2024-10-11 16:03:27,136 - INFO - Processing file 2/1434: file100.pdf
2024-10-11 16:03:27,162 - INFO - Skipping Downloaded_PDFs\file100.pdf as it doesn't meet any moving criteria.
2024-10-11 16:03:27,163 - INFO - Processing file 3/1434: file1000.pdf
2024-10-11 16:03:27,216 - INFO - Skipping Downloaded_PDFs\file1000.pdf as it doesn't meet any moving criteria.
2024-10-11 16:03:27,217 - INFO - Processing file 4/1434: file1001.pdf
2024-10-11 16:03:27,285 - ERROR - Error checking for large images in Downloaded_PDFs\file1001.pdf: not enough image data
2024-10-11 16:03:27,285 - INFO - Skipping Downloaded_PDFs\file1001.pdf as it doesn't meet any moving criteria.
2024-10-11 16:03:27,286 - INFO - Processing file 5/1434: file1002.pdf
2024-10-11 16:03:27,335 - INFO - Skipping Downloaded_PDFs\file1002.pdf as it doesn't meet a

In [11]:
import os
import re
import shutil
from PyPDF2 import PdfReader

def move_pdf_files(directory_path):
    # Define directories for GeM, Tender, EOI, and Comments Objections documents
    gem_folder = os.path.join(directory_path, "GeM documents")
    tender_folder = os.path.join(directory_path, "Tender documents")
    eoi_folder = os.path.join(directory_path, "EOI documents")
    comments_objections_folder = os.path.join(directory_path, "Comments Objections documents")

    # Create the folders if they don't exist
    os.makedirs(gem_folder, exist_ok=True)
    os.makedirs(tender_folder, exist_ok=True)
    os.makedirs(eoi_folder, exist_ok=True)
    os.makedirs(comments_objections_folder, exist_ok=True)

    # Define the patterns for different document types
    gem_pattern = r'GEM/\d{4}/B/\d+'
    eoi_pattern = r'Expression\s*of\s*Interest\s*'
    comments_objections_pattern = r'Inviting\s*comments\s*thereon'

    # Iterate over all files in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)

            # Read the PDF content
            try:
                reader = PdfReader(file_path)
                pdf_text = ""
                for page in reader.pages:
                    pdf_text += page.extract_text()

                # Search for the patterns in the PDF text and move accordingly
                if re.search(gem_pattern, pdf_text):
                    shutil.move(file_path, os.path.join(gem_folder, filename))
                    print(f"Moved {filename} to GeM documents.")
                elif re.search(eoi_pattern, pdf_text, re.IGNORECASE):
                    shutil.move(file_path, os.path.join(eoi_folder, filename))
                    print(f"Moved {filename} to EOI documents.")
                elif re.search(comments_objections_pattern, pdf_text, re.IGNORECASE):
                    shutil.move(file_path, os.path.join(comments_objections_folder, filename))
                    print(f"Moved {filename} to Comments Objections documents.")
                else:
                    shutil.move(file_path, os.path.join(tender_folder, filename))
                    print(f"Moved {filename} to Tender documents.")

            except Exception as e:
                print(f"Error reading {filename}: {e}")

    # Move Tender documents folder to the specified path
    tender_dest = r"D:\Swapnil's personal space\Research\System And Contextual information integration- Scii\Final Tender Code\Data collection\Step-3 Data_extraction\Tender documents"
    
    # Ensure the destination directory exists
    os.makedirs(os.path.dirname(tender_dest), exist_ok=True)
    
    if os.path.exists(tender_dest):
        shutil.rmtree(tender_dest)
    shutil.move(tender_folder, tender_dest)
    print(f"Moved Tender documents to {tender_dest}")

# Example usage
directory_path = 'Downloaded_PDFs'
move_pdf_files(directory_path)

Moved file10.pdf to Tender documents.
Moved file100.pdf to Tender documents.
Moved file1000.pdf to Tender documents.
Moved file1001.pdf to Tender documents.
Moved file1002.pdf to Tender documents.
Moved file1003.pdf to Tender documents.
Moved file1004.pdf to Tender documents.
Moved file1005.pdf to Tender documents.
Moved file1006.pdf to Tender documents.
Moved file1007.pdf to Tender documents.
Moved file1008.pdf to Tender documents.
Moved file1009.pdf to Tender documents.
Moved file101.pdf to Tender documents.
Moved file1010.pdf to Tender documents.
Moved file1011.pdf to Tender documents.
Moved file1012.pdf to Tender documents.
Moved file1013.pdf to Tender documents.
Moved file1014.pdf to Tender documents.
Moved file1015.pdf to Tender documents.
Moved file1016.pdf to Tender documents.
Moved file1017.pdf to Tender documents.
Moved file1018.pdf to Tender documents.
Moved file1019.pdf to Tender documents.
Moved file102.pdf to Tender documents.
Moved file1021.pdf to Tender documents.
Moved

In [2]:
import os
import shutil
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import warnings





# Directories
source_dir = "Downloaded_PDFs/Tender documents"
destination_dir = "Downloaded_PDFs/not needed pdfs"

# Clean up filenames in the directory
def clean_filenames(directory):
    for filename in os.listdir(directory):
        new_filename = filename.strip()
        if new_filename != filename:
            os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))
            print(f"Renamed: '{filename}' to '{new_filename}'")

# Clean filenames in the source directory
clean_filenames(source_dir)


# Create destination folder if it doesn't exist
if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)

def is_image_only_pdf(pdf_path):
    """
    Check if the PDF is image-only by converting pages to images 
    and ensuring no text is extractable.
    """
    try:
        # Convert the first few pages to images to determine if it's image-based
        images = convert_from_path(pdf_path, first_page=1, last_page=3)
        
        # If images are generated, we assume it's image-based
        return len(images) > 0
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return False

def move_pdf_if_not_needed(pdf_path):
    """
    Move PDFs that have fewer than 2 pages or contain no extractable text 
    and are image-based PDFs.
    """
    try:
        # Suppress warnings from PyPDF2 during PDF reading
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            
            # Check if file exists
            if not os.path.exists(pdf_path):
                print(f"File not found: {pdf_path}")
                return

            reader = PdfReader(pdf_path)
            num_pages = len(reader.pages)
        
        # Condition 1: Move PDFs with fewer than 2 pages
        if num_pages <= 4:
            print(f"Moving {pdf_path} because it has fewer than 4 pages.")
            shutil.move(pdf_path, destination_dir)
            return
        
        # Try to extract text from all pages
        has_text = False
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            
            if text and text.strip():  # If any page has extractable text, flag it
                has_text = True
                break
        
        # Condition 2: If no text is found, check if it's an image-only PDF
        if not has_text:
            if is_image_only_pdf(pdf_path):
                print(f"Moving {pdf_path} because it contains no text and is image-based.")
                shutil.move(pdf_path, destination_dir)

    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")

# Iterate through all PDF files in the source directory
for filename in os.listdir(source_dir):
    if filename.endswith(".pdf"):
        # Strip any leading/trailing spaces from the filename
        filename = filename.strip()

        pdf_path = os.path.join(source_dir, filename)

        move_pdf_if_not_needed(pdf_path)

print("Task completed.")


Moving Downloaded_PDFs/Tender documents\008ext1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\01TII.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\03VR.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\05NITSEAmmendment01.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\09_NOTICE_571.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\10.2.23_PIEELECTRIC_3206.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\10NITEEECompleteNIT.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\11NITEEECompleteNIT.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\13.1.23_NOTICE_2898.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\14.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\15.

Multiple definitions in dictionary at byte 0xb8ddf for key /Info
Multiple definitions in dictionary at byte 0xb8deb for key /Info


Moving Downloaded_PDFs/Tender documents\Airway.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Amendment-1.pdf because it has fewer than 4 pages.


/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\Anatomy _Synthetic.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Answers_of_prebid_queries_NIT-198.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Arterial Cannulation.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Auction2.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Auricular ReconstructionSet.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\AutomatedTissue Processor.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\BACTERIAL AND FUNGAL CULTURE _265.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\bloodbank-4-11-22.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Bone Densitometry Unit.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\c0i2emjdNIQ 13 FOR SIGNB

/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\carm (1).pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Central Cannulation.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\CFQ - 40  HMIS.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\CFQ_425 EXTENDE CFQ.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Chartingspecs.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Chemidoc2.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Color Doppler Ultrasound System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Colour Doppler Ultrasound Machine.pdf because it has fewer than 4 pages.


/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\Continuous RenalReplacement Therapy.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\corii01portablecombi.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\corri01UnweighingSystem.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Corri1ETender_haemoglobinopathy.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\CorriE-T006AutomatedAntimicrobial.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\CorriE-Tender_Medical_Loup.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\corriET_101_Automated_Blood_Culture.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\CorriIET010Flourimeter.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\corrpatient_warming_System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender

/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\Dual block particle Cyclotron.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\ECHO.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ECMOMachine.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ecorri_03.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ecorri_04_03.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Electrophysiological System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ELISA Reader.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Elisa1.pdf because it has fewer than 4 pages.


/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\Endoscopic Disc Intervention Set.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ENTWorkstation.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ENTWorkstation6.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ENTWorkstations.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\EP.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Eprocure.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Eprocure_airwayscopesystem.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\EPS.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\EPS1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\erm8btxcderma.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ET007NarrowBandCorr.pdf beca

incorrect startxref pointer(1)


Moving Downloaded_PDFs/Tender documents\High Definition Videobronchoscope.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\High.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\HighVelocityTherapyCPPP.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\HPLC 25_08.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\HPLC System.pdf because it has fewer than 4 pages.


/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\i67dbmaiNIQ FOR MUSEUM JARS.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Immunochemistry Analyzer2022.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Integrateddifficult1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Intimation_letter_for_ET-205.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Intra Operative NerveMonitoring.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\IntraOperating Nerve Monitoring.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Invasive.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Invivoimaging.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\IR22CPP.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\it requirements.pdf because it has fewer than 4

/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\MBDV.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Micro.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Micros.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Microscope (1).pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Microscope-1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Microscope-CDER.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Microscope1.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\mobile.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Mohs Micrographic Surgery Setup.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\MSK USG Etender.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\MSK2.pdf becaus

Multiple definitions in dictionary at byte 0x1a028a for key /Info
Multiple definitions in dictionary at byte 0x1a0297 for key /Info
Multiple definitions in dictionary at byte 0x1a02a4 for key /Info


Moving Downloaded_PDFs/Tender documents\NIT12.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT13.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT134.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT14.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT140.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT15.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT152uplaoded.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT156.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT158.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT162.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT17.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT174.pdf 

Multiple definitions in dictionary at byte 0x1811ad for key /Info
Multiple definitions in dictionary at byte 0x1811ba for key /Info
Multiple definitions in dictionary at byte 0x1811c7 for key /Info


Moving Downloaded_PDFs/Tender documents\NIT86.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NIT88.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NITBIPLANE.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NOC-1052.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\NOC-1068.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\NOC-1758.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NOC-1995.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\NOC-2045.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\NOC-2058.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\NOC-2130.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\NOC-2180.pdf becaus

/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\PCR.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\PCR1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Peripheral Cannulation.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Plasmapheresis Machine Portable.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\POCT.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Portable Ultrasound Machine.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Portable Ultrasound System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\PortableMobile Endoscopy.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\pre249.pdf because it has fewer than 4 pages.


/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\PreBidclarification_ET1081.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\prebid_160.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Pre_-_Bid_Clarification_-_ET_-124.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Pulmonary.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Purchase of Samsung TV.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Qutation398.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Real Time PCR.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\regenerate.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\reviesed_pre_bid_ET-209.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Rigid Bronchoscope.pdf because it has fewer than 4 pages.
Moving Downl

/Prev=0 in the trailer - assuming there is no previous xref table
/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\scan0004.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Scan_20230513.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Scan_20230725.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\ScrapNoticeReamingSystem.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Scrap_ET68.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Scrap_Notice-183.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Slidescanner.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Speed Vac Concentrator.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\SpeedVac Concentrator.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Storagetank12.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender docum

Multiple definitions in dictionary at byte 0x80557 for key /Info
Multiple definitions in dictionary at byte 0x80564 for key /Info
Multiple definitions in dictionary at byte 0x80571 for key /Info


Moving Downloaded_PDFs/Tender documents\Surgicalloup1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Surgicalm.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Tab. Voxelotor.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Tally_package_apr-21.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\TBOCorriET037EYBANK.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\TeachingMaterial.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Tender-Notice.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Tender-WGSM.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\TenderScrapped_Ultrasonic.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Therapeutic Videobronchoscope.pdf because it has fewer than 4 pages.
Moving Downloaded_

/Prev=0 in the trailer - assuming there is no previous xref table
/Prev=0 in the trailer - assuming there is no previous xref table
/Prev=0 in the trailer - assuming there is no previous xref table
Multiple definitions in dictionary at byte 0xfd66e for key /Info
Multiple definitions in dictionary at byte 0xfd67b for key /Info
Multiple definitions in dictionary at byte 0xfd688 for key /Info


Moving Downloaded_PDFs/Tender documents\tmed_1.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Trans.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Transcutaneous.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Transeso.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Transesophageal.pdf because it has fewer than 4 pages.


incorrect startxref pointer(3)
/Prev=0 in the trailer - assuming there is no previous xref table
/Prev=0 in the trailer - assuming there is no previous xref table


Moving Downloaded_PDFs/Tender documents\ujbuyde3NIQ for Procurement of PRI.pdf because it contains no text and is image-based.
Moving Downloaded_PDFs/Tender documents\Ultra centrifuge.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Ultracentrifuge.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Ultrasonic Aspirator System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Ultrasound Machine.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\VAP care.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\VAP1.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Venous Coupler System.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Wide Band Tympanometer Tympanometer.pdf because it has fewer than 4 pages.
Moving Downloaded_PDFs/Tender documents\Wireless FNIRS System.pdf because it has fewer th

In [3]:

# Ensure keywords are treated as whole words with word boundaries
keywords_dict = {
    'Consumables':             ['consumables', 'consumable','consumbale' 'Consuamable', 'batteries', 'battery', 'Chemical', 'Chemicals', 'reagent',  'drug', 
                                'Milk', 'antibiotic', 'bottle', 'bottle', 'Sodalime', 'Echosheild', 'kit', 'QFT Plus Tubes', 'Reagent Kits', 'Serology Kits', 
                                'SUPPLY OF MEDICINE', 'fluids', 'ethanol', 'hand Rub', 'Hand Sanitizer','dopa'],

    'Hardware/Equipment':      ['wiring', 'LED', 'mechanical', 'Electrical Fittings', 'Plumbing', 'Glassware', 'surgical instruments', 'furniture', 'saw',
                                'drill', 'TRIMMER', 'Mowers', 'Pruner', 'cutter', 'Energy Meter', 'Light', 'cable', 'Electrical', 'Electronic', 
                                'line connection', 'power point', 'Sprayer', 'vacuum', 'Instrument', 'freezer', 'Electrophoresis', 'NEPHELOMETERNEPHELOMETER',
                                'PIEZO SURGERY MODULE', 'Refilling', 'refrigerated', 'Hot air oven', 'cylinder', 'cylinders', 'Mechanised', 'ro', 'Electric',
                                'Instrument', 'CRYOSPRAY', 'Warmer', 'plate', 'lamp', 'Refrigerator', 'Air Conditioner', 'Air Conditioners', 
                                'Air Conditioning', 'Air Condiction', 'Fire', 'weighing scale', 'UPS', 'Calibrators', 'cooling tower', 'wires', 
                                'Water Cooler', 'Water Coolers', 'water tanker', 'water tankers', 'sprinklers', 'Water Treatment', 'water supply line',
                                'water purification', 'chiller', 'water purifier', 'Deionized Water','storboscope','Oven','Forceps','cpm','sg','keratometer','autostainer'],

    'Medical Supplies':        ['Catheter', 'surgical items', 'Hospital Store', 'Bandage', 'Disinfectant', 'syringe', 'Orthopedic Implant', 'cannula', 
                                'Abdominal Drainage Kit', 'cataract', 'Spine', 'Phaco Practice', 'Collinear Reduction Clamp', 'Gown', 'BIOSAFETY LEVEL',
                                'Orthopaedics', 'Human Skeleton', 'Leyla Retractor System for Neurosurgeries', 'Panchkarma Material', 'Tourniquet Set', 
                                'Urine', 'Bookwalter Retractor', 'Surgical Items', 'disposable', 'Sterilization', 'Post Mortem Kit', 'Patient Positioner',
                                'Patient Positioning', 'Panchakrma', 'Glove', 'Mask', 'Drainages', 'Stretcher', 'Bed', 'Related Material', 'Intrauterine Insemination',
                                'Fletcher Suite', 'Anterior Segment', 'antibody' ,'Trugnom', 'Biomlife', 'Kaya Cervi Machine','mannikin','ambu mark','telescope','microtome','cryostat'],

    'General Supplies':        ['stationery', 'Stationery', 'cleaning', 'Cleaning', 'bags', 'bag', 'containers', 'dustbin', 'BMW Bins', 'general hospital item', 
                                'mannequins', 'Manikins', 'blankets', 'blanket', 'jars', 'Staplers', 'reusable', 'injection', 'linen', 'Cartridge', 'Trays',
                                'Gloves', 'Mask', 'pad', 'pen', 'sticker', 'envelope', 'files', 'Slipper', 'Gum Boot', 'pillow', 'Foot Step', 'tape', 'Mosquito Net','kit'],

    'Facilities and Utilities': ['Air Conditioners', 'cooling tower', 'Water Cooler', 'Water Coolers', 'vacuum', 'maintenance', 'lift', 'water supply line', 'Water Treatment',
                                'water purification', 'fire extinguisher', 'ECRP-II', 'Guest House', 'Roller Blinds', 'garden', 'storage', 'tank', 'nitrogen', 'liquid', 
                                'Signboards', 'lecture', 'hall', 'flooring', 'ambulance', 'Buses', 'CRECHE', 'Establishment', 'bus', 'Stadium', 'GYM', 'Disposal', 'waste',
                                'Shelter', 'blinds', 'cubicals', 'residential', 'terrace', 'floor', 'door', 'Drainage', 'painting', 'drains', 'Toilet', 'sewer', 'Ceiling',
                                'duct insulation', 'drain', 'earthing', 'strip', 'Waterproofing', 'drinking water', 'watering', 'supply line','vehicle'],

    'Services':                 ['photography', 'Videography', 'renting out', 'Cancellation', 'Trainers', 'manpower', 'Cab', 'taxi', 'sanitation', 'parking',
                                  'ADVERTISEMENT','outreach', 'Outsourcing', 'opd', 'Labour', 'skill lab', 'laundry', 'catering', 'Housekeeping', 'work', 'workers', 
                                  'Trainer', 'Insurance', 'PG', 'hiring', 'Exam', 'maintenance', 'security services', 'Examination', 'Photocopying', 'Binding', 
                                  'Lamination'],

    'Electrical Items':         ['Electrical', 'Electronic', 'line connection', 'power point', 'fan', 'UPS', 'gas'],

    'Office Supplies':          ['manual', 'Printing', 'stationary', 'Stationery', 'files', 'envelope', 'pad', 'sticker', 'book', 'ID cards'],

    'Medical Equipment':        ['Instruments', 'Electrophoresis', 'NEPHELOMETERNEPHELOMETER', 'PIEZO SURGERY MODULE', 'Human Skeleton', 
                                 'Leyla Retractor System for Neurosurgeries', 'weighing scale', 'Bookwalter Retractor', 'mannequin','card' ,'Manikin', 
                                 'CRYOSPRAY', 'Warmer', 'plate', 'Post Mortem Kit', 'Patient Positioner', 'COUCH', 'Taap Swedan Yantra'],
    'Personal Protective Equipment': ['Gloves', 'Mask', 'hand Rub', 'Hand Sanitizer', 'Gown'],

    'Furniture':                ['chair', 'Table', 'Cupboard', 'ALMIRAH', 'Stool', 'Furnishing', 'Bed', 'COUCH', 'Rack', 'Book Shelf','cabinet'],

    'Cleaning Supplies':        ['Cleaning', 'scrub station', 'Disposal', 'waste', 'dustbin', 'BMW Bins', 'Disinfectant', 'Sanitary', 'House keeping'],

    'Construction Materials': ['brick', 'bricks', 'wooden', 'plastic', 'cardboard', 'metal', 'steel', 'aluminium panel', 'interlocking tiles', 
                               'Aluminum grill'],

    'Food Supplies':            ['kitchen', 'Shampoo', 'Milk', 'Canteen', 'mess', '5cy2sri3Mess', 'food', 'cafeteria','dinner','lunch','seating','cook'],

    'Laboratory Supplies':      ['Chemicals', 'reagents', 'Glassware', 'Electrophoresis', 'NEPHELOMETERNEPHELOMETER', 'QFT Plus Tubes', 'Serology Kits',
                                  'Reagent Kits'],
                                  
    'Pharmacy':                 ['pharmacy', 'Janaushadhi Kendra', 'drug', 'antibiotic'],

    'Dental Supplies':          ['Dental Chair', 'dental instruments'],

    'Veterinary Supplies':      ['Animal House'],

    'Miscellaneous':            ['Video', 'Adhesive', 'Pest', 'IS 2631', 'household', 'Welder', 'construction', 'film', 'bod', 'signage', 'firm', 'Centre', 
                                 'GEM', 'curtains', 'napkin', 'napikins', 'civil work', 'wall', 'height', 'quotation', 'jute', 'flower', 'pot', 'trolley', 
                                 'miscellaneous',  'Corriqendum','CORRIGENDUM', 'Bike', 'exercise', 'Playing', 'curtain', 'condemned', 'cough', 'educational', 'psychological scale',
                                 'bath', 'WaterLock', 'water to', 'fixing','vehicle', 'Dressing', 'media', 'empanelment', ' Janaushadhi Kendra', 'diaper','solution','alcohol', 
                                 'beds', 'Lenin', 'Skelton', 'Sink', 'barber','shop',  'Printed Forms/Cards/Register', 'civil', 'Repair', 'cart', 'polymer','consumablesconsumables'
                                'Repairing','comment','pipe','spare parts','washbasin','Renovation','renovation','Beautification','scrap','decoration','NOC','diesel','dg set','rolls',
                                'vessel','gem','freezing','freeze','stapler','needle','scalpel','register','container','plant','cpr','filter','dvt','calibrator','flask','	solutioniodine',
                                'spare','laser','booklet','connector tube','clip','small','sweeping','mini','knee','incubator','roof','water','virus','board','aluminium','partition',
                                'hemocue', 'cuvettes','cold', 'soda', ' lime', 'canister', 'play', 'ground', 'item','workbench','newspaper','disinfection','objection','flag','dharmasala',
                                'lighting','bipolar','accessory','ice']
}

In [4]:
import os
import shutil
import re

def move_files_with_keywords(source_dir, destination_dir, keywords_dict):
    # Ensure the destination directory exists
    os.makedirs(destination_dir, exist_ok=True)

    # Compile the regex patterns for each category
    keyword_patterns = {}
    for category, keywords in keywords_dict.items():
        # Convert all keywords to lowercase for case-insensitive matching
        keywords = [re.escape(keyword.lower()) for keyword in keywords]
        # Create a regex pattern that ensures whole word matching using word boundaries
        pattern = r'\b(' + '|'.join(keywords) + r')\b'
        keyword_patterns[category] = re.compile(pattern, re.IGNORECASE)

    # Iterate through all files in the source directory
    for filename in os.listdir(source_dir):
        # Convert filename to lowercase for case-insensitive matching
        lower_filename = filename.lower()

        # Check each category for matching keywords in the filename
        for category, pattern in keyword_patterns.items():
            if pattern.search(lower_filename):
                source_path = os.path.join(source_dir, filename)
                destination_category_dir = os.path.join(destination_dir, category)

                # Ensure category-specific folder exists
                os.makedirs(destination_category_dir, exist_ok=True)

                destination_path = os.path.join(destination_category_dir, filename)
                
                # Try to move the file and handle exceptions
                try:
                    shutil.move(source_path, destination_path)
                    print(f"Moved: {filename} to {category} folder")
                    break  # Exit loop once the file has been moved to the appropriate category
                except FileNotFoundError:
                    print(f"File not found, skipping: {filename}")
                except Exception as e:
                    print(f"Error moving {filename}: {e}")

# Define the source and destination directories
source_dir = "Tender documents"
destination_dir = "not needed pdfs/test"



# Call the function to move the files
move_files_with_keywords(source_dir, destination_dir, keywords_dict)

print("File moving process completed.")


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'Tender documents'