# ONE PATIENT PDF WITH MANY TABLES, EACH TABLE IN A DIFFERENT WORKSHEET

In [10]:
import pdfplumber
import pandas as pd
import os

# Get the directory where the script is located
script_dir = os.path.dirname(os.path.abspath("E:\EEG_notebook\EEG_pdf_files_to_Excel\PROTOCOL.PYTHON.EXCEL.pdf"))

# Specify the input PDF file (replace with your actual file path)
pdf_file = os.path.join(script_dir, "E:\EEG_notebook\EEG_pdf_files_to_Excel\PROTOCOL.PYTHON.EXCEL.pdf")

# Extract the filename without extension for the output Excel file
pdf_filename = os.path.splitext(pdf_file)[0]
excel_file = os.path.join(script_dir, pdf_filename + ".xlsx")

# Function to clean strings (remove leading/trailing spaces)
def clean_string(s):
    if isinstance(s, str):
        return s.strip()  
    return s

# Function to check and remove duplicate titles from tables
def remove_title_from_table(title, table):
    if len(table) > 0 and isinstance(table[0][0], str) and title == table[0][0].strip():
        return table[1:]  
    return table

# List to store extracted tables and their titles
all_tables = []

with pdfplumber.open(pdf_file) as pdf:
    for page_num, page in enumerate(pdf.pages, start=1): 
        page_text = page.extract_text()
        lines = page_text.split("\n")

        tables = page.extract_tables()
        for table in tables:
            if not table or len(table[0]) < 2:
                continue 

            # Find the table title (line above the table)
            title = None
            for line in lines:
                if line.strip():
                    title = clean_string(line)
                    break

            # Remove title from table if it's included as the first row
            table = remove_title_from_table(title, table)

            # Create pandas DataFrame
            df = pd.DataFrame(table)

            # Set the first row as the header
            df.columns = df.iloc[0]
            df = df[1:]

            # Clean data by removing leading/trailing spaces from cells
            df = df.map(lambda x: clean_string(x) if isinstance(x, str) else x)

            # Add table and title to the list
            all_tables.append((title, df))

# Save tables to Excel
with pd.ExcelWriter(excel_file, engine="openpyxl") as writer:
    for i, (title, table) in enumerate(all_tables):
        sheet_name = f"Table_{i+1}"
        table.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1) 

        # Add table title to the first row of the sheet
        worksheet = writer.sheets[sheet_name]
        worksheet.cell(row=1, column=1).value = title

print(f"Tables and titles extracted and saved to '{excel_file}'.")

Tables and titles extracted and saved to 'E:\EEG_notebook\EEG_pdf_files_to_Excel\PROTOCOL.PYTHON.EXCEL.xlsx'.


In [2]:
import sys
print(f"Current environment: {sys.prefix}") 

Current environment: C:\Users\paulo\miniconda3\envs\qeeg397
