<a href="https://colab.research.google.com/github/machoo-l/machoo-l.github.i/blob/master/examples/camelot-quickstart-notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quickstart example

This notebook shows you how to quickly get started with [camelot](https://github.com/camelot-dev/camelot).

**Usage:** Either upload PDFs or add a URL to a PDF in the specified cells.

In [None]:
# @title 🛠️ Install [camelot](https://github.com/camelot-dev/camelot)
!pip install camelot-py
# install tabulate (optional) only needed in this notebook for pretty display of results.
# !pip install tabulate

In [None]:
# Bootstrap: common imports, then let a local copy of camelot take precedence
import os
import sys
import time

# os.path.abspath("") resolves to the current working directory; placing it
# first on sys.path means a local version of camelot is preferred if available.
local_root = os.path.abspath("")
sys.path.insert(0, local_root)

import camelot

print(f"Using camelot v{camelot.__version__}.")

In [None]:
# @title 📂 Create necessary directories and delete `sample_data` if exists

import os
import shutil
from pathlib import Path


# Function to delete a directory and its contents
def delete_directory(path):
    """Recursively remove ``path`` and print a one-line outcome.

    Nothing is raised for ordinary failures: a missing directory and any other
    error from the removal are reported on standard output instead.

    Args:
        path: Directory to remove (anything ``shutil.rmtree`` accepts).

    Returns:
        None.
    """
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        outcome = f"Directory not found: {path}"
    except Exception as err:
        outcome = f"Error deleting directory {path}: {err}"
    else:
        outcome = f"Deleted directory: {path}"
    print(outcome)


# Locations used by this cell
sample_data_dir = Path("/content/sample_data")
input_dir = Path("/content/sample_pdfs")
output_dir = Path("/content/output")

# Remove /content/sample_data, but only when it is actually present
if sample_data_dir.exists():
    print("Deleting /content/sample_data directory...")
    delete_directory(sample_data_dir)

# Make sure the output and input directories exist (no error if they already do)
for required_dir in (output_dir, input_dir):
    os.makedirs(required_dir, exist_ok=True)

print("Directories set up complete.")
print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

In [None]:
# @title 📤 Upload Files (Optional)

from google.colab import files

print("\nPlease upload your PDF files. They will be saved in /content/sample_pdfs")

# Open the upload dialog; the keys of the returned mapping are used as file names
uploaded = files.upload()

content_root = Path("/content")
upload_target = Path("/content/sample_pdfs")

# Relocate every uploaded file from /content into /content/sample_pdfs
for filename in uploaded:
    src_path = content_root / filename
    dst_path = upload_target / filename
    if not src_path.exists():
        print(f"Warning: {filename} not found in /content")
        continue
    shutil.move(str(src_path), str(dst_path))
    print(f"Moved {filename} to /content/sample_pdfs")

# Delete whatever *.pdf files are still sitting directly in /content
for file in content_root.glob("*.pdf"):
    os.remove(file)
    print(f"Removed {file.name} from /content")

print("\nUpload and organization complete. Files are now only in /content/sample_pdfs")

# Show what ended up in /content/sample_pdfs
print("\nContents of /content/sample_pdfs:")
print(os.listdir("/content/sample_pdfs"))

# Double-check that /content itself no longer holds any PDF
print("\nChecking for PDF files in /content:")
content_pdfs = list(content_root.glob("*.pdf"))
if not content_pdfs:
    print("No PDF files found in /content")
else:
    print("Warning: Found these PDF files in /content:")
    for pdf in content_pdfs:
        print(f" - {pdf.name}")

In [None]:
# @title ⬇📕 Download Sample .PDF Document (Optional)

# import os
import requests

# from pathlib import Path


def convert_github_url_to_raw(url):
    """Turn a GitHub ``/blob/`` page URL into the matching raw-content URL.

    Args:
        url: URL string to convert.

    Returns:
        ``url`` with the first ``github.com`` replaced by
        ``raw.githubusercontent.com`` and the first ``/blob/`` collapsed to
        ``/``; or the sentinel string ``"Invalid GitHub URL"`` when ``url``
        does not contain both ``github.com`` and ``/blob/``.
    """
    if "github.com" not in url or "/blob/" not in url:
        return "Invalid GitHub URL"

    # Rewrite only the FIRST occurrence of each marker. A repository path may
    # legitimately contain "github.com" or a directory named "blob" further
    # along, and rewriting those too would point at a different (wrong) file.
    raw_url = url.replace("github.com", "raw.githubusercontent.com", 1)
    return raw_url.replace("/blob/", "/", 1)


from urllib.parse import urlparse

# Sample .pdf data from GitHub
pdf_url = "https://github.com/camelot-dev/camelot/blob/master/docs/_static/pdf/foo.pdf"  # @param {type:"string"}

# Convert the GitHub URL to the raw content URL
pdf_url = convert_github_url_to_raw(pdf_url)

# The converter reports failure with a sentinel string, not an exception
if pdf_url == "Invalid GitHub URL":
    raise ValueError("The provided GitHub URL is invalid.")

# Derive the file name from the URL *path* only, so that a query string or
# fragment (e.g. "?raw=true") does not become part of the saved name and hide
# the file from the later "*.pdf" glob.
filename = os.path.basename(urlparse(pdf_url).path)
if not filename:
    raise ValueError(f"Could not determine a file name from URL: {pdf_url}")

# Create the /content/sample_pdfs directory if it doesn't exist
sample_pdfs_dir = Path("/content/sample_pdfs")
sample_pdfs_dir.mkdir(parents=True, exist_ok=True)

# Download the PDF. Without a timeout the request may wait forever on an
# unresponsive server and leave the cell hanging.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()  # Raise on 4xx/5xx instead of saving an error page

# Specify the file path in the /content/sample_pdfs directory
pdf_file_path = sample_pdfs_dir / filename

# Save the file, overwriting if it already exists
with open(pdf_file_path, "wb") as file:
    file.write(response.content)

print(f"PDF file downloaded and saved to: {pdf_file_path}")

In [None]:
import logging
import pandas as pd

# from pathlib import Path
from pypdf import PdfReader
from IPython.display import display

# Logging setup: DEBUG level on the "camelot" logger, then a basic root
# configuration whose records read "<time> - <level> - <message>"
camelot_logger = logging.getLogger("camelot")
camelot_logger.setLevel(logging.DEBUG)

log_settings = {
    "level": logging.DEBUG,
    "format": "%(asctime)s - %(levelname)s - %(message)s",
}
logging.basicConfig(**log_settings)


def process_pdf(pdf_file, output_dir):
    """Extract the tables of one PDF with camelot, display them and save each as CSV.

    Status messages are both printed and sent to ``logging``. Failures are
    reported and swallowed (the function simply returns), so a caller looping
    over several PDFs carries on with the next one.

    Args:
        pdf_file: Path object of the PDF to process; its ``.name`` and
            ``.stem`` attributes are used (a ``pathlib.Path`` in this notebook).
        output_dir: Path object of the directory under which a sub-directory
            named after the PDF's stem is created to hold the CSV files.

    Returns:
        None. Returns early, after reporting, when the PDF cannot be opened
        with ``PdfReader``, when ``camelot.read_pdf`` fails, or when no tables
        are detected.
    """

    def _report(level, message):
        # Mirror one status line to the cell output and to the logging system.
        print(message)
        logging.log(level, message)

    _report(logging.INFO, f"Processing {pdf_file.name}")

    # Verify PDF can be opened with PdfReader before processing
    try:
        reader = PdfReader(str(pdf_file))
        if len(reader.pages) == 0:
            raise ValueError(f"No pages found in PDF {pdf_file.name}")
    except Exception as e:
        _report(
            logging.ERROR, f"Failed to open PDF {pdf_file.name} with PdfReader: {e}"
        )
        return

    # Read tables from the PDF using camelot.
    # NOTE(review): no `pages` argument is passed, so camelot's default page
    # selection applies — confirm that is what is wanted for multi-page PDFs.
    try:
        tables = camelot.read_pdf(str(pdf_file))
    except Exception as e:
        _report(logging.ERROR, f"Failed to read PDF {pdf_file.name}: {e}")
        return

    if len(tables) == 0:
        _report(logging.WARNING, f"No tables detected in {pdf_file.name}")
        return

    # Create a subdirectory for this PDF's output. parents=True keeps this from
    # failing when output_dir itself has not been created yet.
    pdf_output_dir = output_dir / pdf_file.stem
    pdf_output_dir.mkdir(parents=True, exist_ok=True)

    # Process individual tables; a failure on one table does not stop the rest
    for i, table in enumerate(tables):
        try:
            # Convert table to pandas DataFrame
            df = table.df

            # Display the DataFrame
            print(f"\nTable {i+1} from {pdf_file.name}:")
            display(df)

            # Save individual table to CSV
            csv_path = pdf_output_dir / f"{pdf_file.stem}_table_{i+1}.csv"
            df.to_csv(csv_path, index=False)
            print(f"Saved to {csv_path}")

            # Print and log the parsing report for each table
            print(f"\nTable {i+1} Parsing Report:")
            logging.info(f"Table {i+1} Parsing Report:")
            print(table.parsing_report)
            logging.info(table.parsing_report)
        except Exception as e:
            _report(
                logging.ERROR,
                f"Failed to process or save table {i+1} from {pdf_file.name}: {e}",
            )


# Where the PDFs are read from and where the CSV output goes
input_dir = Path("/content/sample_pdfs")
output_dir = Path("/content/output")

print(f"Input directory: {input_dir}")
print(f"Output directory: {output_dir}")

# Create the output directory when it is missing
output_dir.mkdir(exist_ok=True)

# Collect every *.pdf sitting directly inside the input directory
pdf_files = [*input_dir.glob("*.pdf")]
print(f"Found {len(pdf_files)} PDF files")

if pdf_files:
    for pdf_file in pdf_files:
        process_pdf(pdf_file, output_dir)

    done_message = "Processing complete. Check the 'output' folder for results."
    print(done_message)
    logging.info(done_message)
else:
    empty_message = "No PDF files found in the input directory."
    print(empty_message)
    logging.warning(empty_message)

print("Script execution finished.")

In [None]:
# @title ⚙️ Core - Complex Tables (Loose Parameters) with Clean Output

# import os
# from pathlib import Path
import pandas as pd
import numpy as np
from tabulate import tabulate

# Create output directory if it doesn't exist
output_dir = Path("/content/output")
output_dir.mkdir(parents=True, exist_ok=True)

input_dir = Path("/content/sample_pdfs")

# Pages handed to camelot for every PDF. "all" is valid for a document of any
# length; narrow it (for example "1,2,5-7") when only some pages hold tables.
pages_to_parse = "all"

# Process all PDF files in the input directory
for pdf_file in input_dir.glob("*.pdf"):
    print(f"\nProcessing {pdf_file.name}")

    # First attempt: the 'network' parser. This assignment must run on every
    # iteration; the checks below read `tables_network`.
    # NOTE(review): 'network' needs a camelot release that ships this flavor —
    # confirm against the installed version.
    tables_network = camelot.read_pdf(
        str(pdf_file), flavor="network", pages=pages_to_parse
    )

    if len(tables_network) > 0:
        tables = tables_network
    else:
        # Nothing detected: retry with the 'stream' parser
        tables = camelot.read_pdf(
            str(pdf_file), flavor="stream", pages=pages_to_parse
        )

    # Exporting if tables are found
    if len(tables) > 0:
        output_base = output_dir / pdf_file.stem

        # Combine all tables into a single DataFrame and export it as one CSV
        combined_df = pd.concat([table.df for table in tables], ignore_index=True)
        combined_df.to_csv(f"{output_base}_combined_tables.csv", index=True)

        df = tables[0].df  # Only the first table is cleaned and displayed below

        print(f"Tables found in {pdf_file.name}:")

        # Clean up the DataFrame.
        # DataFrame.applymap is deprecated in recent pandas in favour of
        # DataFrame.map; use whichever the installed version provides.
        elementwise = df.map if hasattr(df, "map") else df.applymap
        df = elementwise(
            lambda x: x.strip() if isinstance(x, str) else x
        )  # Remove leading/trailing whitespace
        df = df.replace(["", "nan", "NaN", "NULL"], np.nan).dropna(
            how="all"
        )  # Remove empty rows
        df = df.fillna("")  # Replace NaN with empty string for display
        df = df.reset_index(drop=True)  # Reset index after dropping rows

        # Display the clean DataFrame
        print(tabulate(df, headers="keys", tablefmt="pretty", showindex=False))

        print(f"\nShape of the DataFrame: {df.shape}")
        print(f"\nParsing report: {tables[0].parsing_report}")
    else:
        print(f"No tables found in {pdf_file.name}")

print("\nProcessing complete. Check the output directory for results.")

In [None]:
# @title 🗑️ Clear Input & Output Directory

import shutil

# from pathlib import Path
# import os

# Define the directories to be cleared
directories_to_clear = ["/content/output", "/content/sample_pdfs"]

# Warning message
print("⚠️ WARNING: This will delete all contents of the following directories:")
for directory in directories_to_clear:
    print(f"- {directory}")

# Nothing is deleted unless the user types exactly YES
confirmation = input("Type 'YES' to confirm: ")

if confirmation == "YES":
    for directory in directories_to_clear:
        dir_path = Path(directory)
        if dir_path.exists() and dir_path.is_dir():
            # Remove all contents of the directory, keeping the directory itself
            for item in dir_path.iterdir():
                # A symlink that points at a directory also reports is_dir(),
                # but shutil.rmtree refuses symbolic links. Unlinking removes
                # just the link and leaves its target untouched.
                if item.is_dir() and not item.is_symlink():
                    shutil.rmtree(item)
                else:
                    item.unlink()
            print(f"✅ All contents of '{directory}' have been deleted.")
        else:
            print(f"The '{directory}' directory does not exist.")
else:
    print("Operation cancelled. No files were deleted.")