In [3]:
import pdfplumber
import json

def extract_information_and_tables(pdf_path: str) -> dict:
    """Collect the text and the tables found on every page of a PDF.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A dict with two parallel, page-ordered lists:
          * ``"text"``   - one entry per page, whatever ``page.extract_text()``
            returned for that page.
          * ``"tables"`` - one entry per page; each entry is a list of tables,
            each table a list of rows, and each row a dict that maps
            ``"col_<index>"`` to the cell value at that position.
    """
    page_texts = []
    page_tables = []

    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            # Text first, then tables, one entry per page in both lists.
            page_texts.append(current_page.extract_text())
            page_tables.append([
                [
                    # Rows carry no header information, so columns are keyed
                    # by their position within the row.
                    {f"col_{index}": value for index, value in enumerate(cells)}
                    for cells in raw_table
                ]
                for raw_table in current_page.extract_tables()
            ])

    return {"text": page_texts, "tables": page_tables}

def save_extracted_data(data: dict, output_file: str):
    """Serialize ``data`` as JSON (4-space indentation) into ``output_file``.

    The target file is opened in text write mode, so existing content is
    replaced. Nothing is returned.
    """
    with open(output_file, mode='w') as destination:
        json.dump(data, fp=destination, indent=4)

def main(pdf_path: str = 'input.pdf', output_file: str = 'output.json'):
    """Run the whole pipeline: extract a PDF's text and tables, save them as JSON.

    Args:
        pdf_path: Path of the PDF to read. Defaults to ``'input.pdf'``, the
            value that used to be hard-coded, so ``main()`` behaves as before.
        output_file: Path of the JSON file to write. Defaults to
            ``'output.json'`` for the same reason.

    Returns:
        None. The result is written to ``output_file`` and a confirmation
        line is printed.
    """
    # Extract data from PDF
    extracted_data = extract_information_and_tables(pdf_path)

    # Save the extracted data
    save_extracted_data(extracted_data, output_file)

    print(f"Extracted data has been saved to {output_file}")

# Entry-point guard: run the pipeline only when this code executes as the
# top-level "__main__" module (as it did for the cell output shown below),
# and not when the definitions above are imported from elsewhere.
if __name__ == "__main__":
    main()


Extracted data has been saved to output.json


In [2]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.0-py3-none-any.whl.metadata (39 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.29.0-py3-none-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.5 kB ? eta -:--:--
     -------------------------------- ----- 41.0/48.5 kB 991.0 kB/s eta 0:00:01
     -------------------------------------- 48.5/48.5 kB 814.3 kB/s eta 0:00:00
Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)
   ---------------------------------------- 0.0/56.4 kB ? eta -:--:--
   ------------------------------------ --- 51.2/56.4 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 56.4/56.4 kB 981.5 kB/s eta 0:00:00
Downloading pypdfium2-4.29.0-py3-none-win_amd64.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
   - -------------------------------------- 0.1/2.9 MB 2.3 MB/s eta 0:00:02
   -- ------------------------------------- 0.2/2.9 MB 2.4 MB/s eta 0:00: