# ALL TABLES IN ONE WORKSHEET, TITLE AND HEAD ARE SEPARATED ONE LINE FROM TABLE

Dynamic PDF Folder and Excel Name – The script asks the user to select the folder that contains the PDF files and to enter the desired Excel file name.
Excel Creation in Same Folder – The Excel file is created in the same folder as the PDFs, with one worksheet per PDF.
Custom Excel Naming – The user enters the Excel file name (without extension) at run time, so it is not hardcoded.

In ancient Egypt, epileptic seizures were seen as visits from the gods – a mystical experience that connected the afflicted to the divine. However, during the Middle Ages, the same seizures could lead to far more dangerous consequences. Epilepsy was often misunderstood and associated with demonic possession, putting those who suffered from it at risk of being condemned and even burned at the stake.

Amidst this dark period, Saint Valentine emerged as the patron saint of epileptics, offering hope and spiritual protection to those battling the condition. His legacy stands as a reminder of compassion and the importance of understanding medical conditions beyond superstition.

I don't believe in witches or demons, but I do believe you'll hit the like button and subscribe to the channel.

In [2]:
import pdfplumber
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog, simpledialog

# Ask user to select a folder containing PDFs.
# A Tk root must exist for the dialogs; it is hidden so no empty main window
# appears, and destroyed once both dialogs are done so it does not linger.
root = tk.Tk()
root.withdraw()  # Hide the main window
try:
    folder_path = filedialog.askdirectory(title="Select Folder with PDF Files")

    if not folder_path:
        print("No folder selected. Exiting.")
        # `exit()` is injected by the `site` module for interactive sessions;
        # raising SystemExit works regardless of how the script is launched.
        raise SystemExit

    # Ask for Excel file name
    excel_file_name = simpledialog.askstring("Input", "Enter a name for the Excel file (without extension):")
    # askstring returns None when the dialog is cancelled; a whitespace-only
    # answer is equally unusable as a file name, so treat both the same way.
    excel_file_name = excel_file_name.strip() if excel_file_name else ""
    if not excel_file_name:
        print("No file name provided. Exiting.")
        raise SystemExit
finally:
    root.destroy()

# The workbook is written next to the PDFs it was built from.
excel_file = os.path.join(folder_path, excel_file_name + ".xlsx")

# Helper that trims surrounding whitespace from text values
def clean_string(s):
    """Return *s* stripped of leading/trailing whitespace when it is a string.

    Any non-string value (e.g. None or a number) is handed back unchanged.
    """
    return s.strip() if isinstance(s, str) else s

# Function to check and remove duplicate titles from tables
def remove_title_from_table(title, table):
    """Drop the first row of *table* when its first cell repeats *title*.

    Args:
        title: The title text to look for, or None when no title is known.
        table: A list of rows, each row being a list of cell values.

    Returns:
        ``table[1:]`` when the first cell of the first row is a string equal
        to *title* (surrounding whitespace ignored on both sides); otherwise
        *table* unchanged.
    """
    # Nothing to compare against: no rows at all, or an empty first row
    # (indexing table[0][0] on an empty row would raise IndexError).
    if not table or not table[0]:
        return table

    first_cell = table[0][0]
    if (
        isinstance(title, str)
        and isinstance(first_cell, str)
        and title.strip() == first_cell.strip()
    ):
        return table[1:]  # Remove first row if it matches the title
    return table

# Characters Excel does not accept in a worksheet title; each is replaced by "_".
_INVALID_SHEET_CHARS = str.maketrans({ch: "_" for ch in "\\/*?:[]"})
_MAX_SHEET_NAME_LEN = 31  # Excel's limit on worksheet title length


def _unique_sheet_name(stem, used_names):
    """Return a worksheet title derived from *stem* that is valid and unused.

    The stem has forbidden characters replaced and is cut to 31 characters.
    If that title is already in *used_names* (compared case-insensitively),
    a numeric suffix ("_2", "_3", ...) is appended, shortening the base so
    the result still fits. The chosen title is recorded in *used_names*.
    """
    base = stem.translate(_INVALID_SHEET_CHARS)[:_MAX_SHEET_NAME_LEN] or "Sheet"
    candidate = base
    counter = 2
    while candidate.lower() in used_names:
        suffix = f"_{counter}"
        candidate = base[:_MAX_SHEET_NAME_LEN - len(suffix)] + suffix
        counter += 1
    used_names.add(candidate.lower())
    return candidate


# Prepare to write to a single Excel file: one worksheet per PDF, with all of
# that PDF's tables laid out side by side.
with pd.ExcelWriter(excel_file, engine="openpyxl") as writer:
    # Flag to track if any data was written
    data_written = False
    # Lower-cased worksheet titles already used, so two PDFs whose names share
    # the same first 31 characters do not end up writing into the same sheet.
    used_sheet_names = set()

    # Iterate over all PDF files in the selected folder (sorted so the sheet
    # order is deterministic; os.listdir returns entries in arbitrary order).
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_file = os.path.join(folder_path, file_name)
        pdf_filename = os.path.splitext(file_name)[0]

        # List to store DataFrames to append side by side
        table_list = []

        # Open and process each PDF
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue  # Skip if no text on the page

                # The title is the first non-blank text line of the page.
                # NOTE: it does not depend on the table, so every table found
                # on the same page gets this same title.
                lines = page_text.split("\n")
                title = next(
                    (clean_string(line) for line in lines if line.strip()),
                    None,
                )

                for table in page.extract_tables():
                    if not table or len(table[0]) < 2:
                        continue  # Skip tables with less than 2 columns

                    # Remove title from table if it's included as the first row
                    table = remove_title_from_table(title, table)
                    if not table:
                        # The table consisted of the title row only; there is
                        # no header row left to promote (df.iloc[0] would fail).
                        continue

                    # Create pandas DataFrame from the table
                    df = pd.DataFrame(table)

                    # Set the first row as the header
                    df.columns = df.iloc[0]
                    df = df[1:]

                    # Clean data by removing leading/trailing spaces from cells
                    # (clean_string leaves non-string cells untouched).
                    df = df.map(clean_string)

                    # Ensure table name aligns with the table width: the title
                    # sits in the first column, the rest of the row is empty.
                    table_name_df = pd.DataFrame(
                        [[title] + [None] * (len(df.columns) - 1)],
                        columns=df.columns,
                    )
                    # The sheet is written with header=False, so the header is
                    # re-inserted as an ordinary data row below the title.
                    header_df = pd.DataFrame([df.columns.tolist()], columns=df.columns)
                    full_table = pd.concat([table_name_df, header_df, df], ignore_index=True)

                    # Append with spacing (2 columns between tables)
                    if table_list:
                        spacer = pd.DataFrame(columns=[None] * 2)
                        table_list.append(spacer)
                    table_list.append(full_table)

        # Concatenate tables side by side with spacing
        if table_list:
            sheet_name = _unique_sheet_name(pdf_filename, used_sheet_names)
            combined_df = pd.concat(table_list, axis=1)
            combined_df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)
            data_written = True
            print(f"Tables from '{file_name}' added to '{excel_file}' in sheet '{sheet_name}'.")
        else:
            print(f"No tables found in '{file_name}'. Skipping.")

    # Create a dummy sheet if no data was written, so the workbook contains
    # at least one worksheet when it is saved.
    if not data_written:
        empty_df = pd.DataFrame(["No tables found in any PDF."])
        empty_df.to_excel(writer, sheet_name="No Data", index=False, header=False)

print(f"Processing complete. All results saved in one Excel file at: {folder_path}")


Tables from 'PROTOCOL.PYTHON.EXCEL01.pdf' added to 'E:/EEG_notebook/EEG_pdf_files_to_Excel/study pdfs\test1.xlsx' in sheet 'PROTOCOL.PYTHON.EXCEL01'.
Tables from 'PROTOCOL.PYTHON.EXCEL02.pdf' added to 'E:/EEG_notebook/EEG_pdf_files_to_Excel/study pdfs\test1.xlsx' in sheet 'PROTOCOL.PYTHON.EXCEL02'.
Tables from 'PROTOCOL.PYTHON.EXCEL03.pdf' added to 'E:/EEG_notebook/EEG_pdf_files_to_Excel/study pdfs\test1.xlsx' in sheet 'PROTOCOL.PYTHON.EXCEL03'.
Processing complete. All results saved in one Excel file at: E:/EEG_notebook/EEG_pdf_files_to_Excel/study pdfs
