In [1]:
import os
import re
import json
import pdfplumber

# Patterns used to clean page text; compiled once instead of per page.
_DEVANAGARI_RE = re.compile(r'[\u0900-\u097F]+')
_CID_RE = re.compile(r'\s*\(cid:\d+\)\s*')
_WHITESPACE_RE = re.compile(r'\s+')


# Define a function to extract text and tables from a single PDF
def extract_pdf_content(file_path):
    """Return the cleaned text and table rows of one PDF as a single string.

    For every page, the page text is cleaned (Devanagari characters and
    ``(cid:NNNN)`` placeholders removed, whitespace runs collapsed to one
    space) and emitted as one newline-terminated line. Each row of each
    table found on the page follows as a ``' | '``-joined,
    newline-terminated line, with ``None`` cells rendered as empty strings.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The concatenated content of all pages; ``""`` if nothing was
        extracted.
    """
    parts = []

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract the raw text from the PDF page
            text = page.extract_text()
            if text:
                # Remove Devanagari script
                text = _DEVANAGARI_RE.sub('', text)

                # Remove patterns like (cid:7414) along with the parentheses
                text = _CID_RE.sub('', text)

                # Collapse the whitespace left behind by the removals
                text = _WHITESPACE_RE.sub(' ', text).strip()

                if text:
                    # Terminate the page text with a newline so it does not
                    # fuse with the first table row or the next page's text.
                    parts.append(text + '\n')

            # Extract tables (if any) and append them in plain text format
            for table in page.extract_tables() or []:
                for row in table:
                    # Handle NoneType by converting None to empty string
                    sanitized_row = [str(cell) if cell is not None else '' for cell in row]
                    parts.append(' | '.join(sanitized_row) + '\n')

    # Join once at the end rather than growing a string with repeated +=.
    return ''.join(parts)

# Function to process all PDFs in the directory and save as JSON
def process_pdfs(directory):
    """Extract the content of every PDF file directly inside *directory*.

    Entries are visited in sorted name order so the result is deterministic
    (``os.listdir`` returns names in arbitrary order). The ``.pdf``
    extension is matched case-insensitively, and entries that are not
    regular files (e.g. a folder whose name ends in ``.pdf``) are skipped.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``{'file_name': ..., 'content': ...}`` dicts, one per PDF.
    """
    all_data = []

    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)

        # Only real files with a PDF extension are handed to the extractor.
        if not file_name.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue

        print(f"Processing {file_name}...")

        # Store content and file name in a structured format
        all_data.append({
            'file_name': file_name,
            'content': extract_pdf_content(file_path),
        })

    return all_data

# Function to save extracted data to a JSON file
def save_to_json(data, output_file):
    """Write *data* to *output_file* as UTF-8 JSON with 4-space indentation.

    Missing parent directories of *output_file* are created first, so a
    nested output path does not fail with ``FileNotFoundError``. Non-ASCII
    characters are written as-is rather than ``\\u`` escaped.

    Args:
        data: Any JSON-serializable object.
        output_file: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        # A bare file name has no parent part; makedirs('') would raise.
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

# Main processing function
def main():
    """Extract every PDF in the tender-documents folder and save it as JSON.

    Reads PDFs from the relative ``Tender documents`` directory, collects
    their content with ``process_pdfs``, writes the result with
    ``save_to_json`` and prints a completion message.
    """
    directory = r"Tender documents"  # Specify your directory containing PDFs
    # Raw string: the backslashes are path separators, not escape sequences
    # (the non-raw form relied on invalid escapes such as "\." and "\J").
    output_file = r'..\..\Json_pdf_title_extraction\Extraction\extracted_pdf_data.json'  # Output JSON file

    # Process all PDFs in the specified directory
    pdf_data = process_pdfs(directory)

    # Save the extracted content to a JSON file
    save_to_json(pdf_data, output_file)

    print(f"Extraction completed. Data saved to {output_file}")

# Run the main process only when executed directly (script or notebook cell),
# so importing this module does not trigger a full extraction run.
if __name__ == "__main__":
    main()


Processing file10.pdf...
Processing file100.pdf...
Processing file1000.pdf...
Processing file1001.pdf...
Processing file1002.pdf...
Processing file1003.pdf...
Processing file1004.pdf...
Processing file1005.pdf...
Processing file1006.pdf...
Processing file1007.pdf...
Processing file1008.pdf...
Processing file1009.pdf...
Processing file101.pdf...
Processing file1010.pdf...
Processing file1011.pdf...
Processing file1012.pdf...
Processing file1013.pdf...
Processing file1014.pdf...
Processing file1015.pdf...
Processing file1016.pdf...
Processing file1017.pdf...
Processing file1018.pdf...
Processing file1019.pdf...
Processing file102.pdf...
Processing file1021.pdf...
Processing file1022.pdf...
Processing file1023.pdf...
Processing file1024.pdf...
Processing file1025.pdf...
Processing file1026.pdf...
Processing file1027.pdf...
Processing file1028.pdf...
Processing file1029.pdf...
Processing file1030.pdf...
Processing file1031.pdf...
Processing file1032.pdf...
Processing file1033.pdf...
Proces