In [6]:
pip install pdfplumber pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
INFO: pip is looking at multiple versions of pdfplumber to determine which version is compatible with other requirements. This could take a while.
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-macosx_11_0_arm64.whl.metadata (67 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20251107->pdfplumber)
  Downloading cryptography-46.0.3-cp38-abi3-macosx_10_9_universal2.whl.metadata (5.7 kB)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six==20251107->pdfplumber)
  Downloading cffi-2.0.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.6 kB)
Collecting typing-extensions>=4.13.

In [24]:
pip install 'camelot-py[cv]'

Defaulting to user installation because normal site-packages is not writeable
Collecting camelot-py[cv]
  Downloading camelot_py-1.0.9-py3-none-any.whl.metadata (9.8 kB)
[0mCollecting click>=8.0.1 (from camelot-py[cv])
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting chardet>=5.1.0 (from camelot-py[cv])
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pypdf<4.0,>=3.17 (from camelot-py[cv])
  Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)
Collecting tabulate>=0.9.0 (from camelot-py[cv])
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting opencv-python-headless>=4.7.0.68 (from camelot-py[cv])
  Downloading opencv_python_headless-4.13.0.90-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting numpy>=1.24.4 (from camelot-py[cv])
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Downloading click-8.1.8-py3-none

In [46]:
import camelot

# Ask camelot for the tables on every page of the report (flavor='lattice').
tables = camelot.read_pdf('../data/PDF_version_1000/15_9_19_A_1997_07_30.pdf', pages='all', flavor='lattice')

# Write each table to table_<index>.csv; the path is relative, so the files
# land in the notebook's current working directory.
for i, table in enumerate(tables):
    table.to_csv(f'table_{i}.csv')

In [45]:
import pdfplumber

# Print the text of the first page that lies between the
# 'Summary of activities' marker and the following 'Operations' marker.
with pdfplumber.open('../data/PDF_version_1000/15_9_19_A_1997_07_30.pdf') as pdf:
    # Fall back to an empty string so the searches below never run on None.
    text = pdf.pages[0].extract_text() or ''

    # str.find returns -1 on a miss; used directly as a slice bound that would
    # silently select the wrong part of the text, so it is checked explicitly.
    start = text.find('Summary of activities')

    if start == -1:
        end = -1
        print("Marker 'Summary of activities' not found on the first page")
    else:
        # Only look for the closing marker after the opening one; if it is
        # missing, print through to the end of the page text.
        end = text.find('Operations', start)
        if end == -1:
            end = len(text)

        print(text[start:end])

Summary of activities (24 Hours)
FINISHED HANDLING MILLING BHA. MU & TIH WITH DRILLING BHA TO TD.
DRILLED FROM 2202 - 2405 M. SLIDING AS DIRECTIONAL PROGRAM REQUIRED
TO DROP ANGLE & TURN HOLE TOWARD OBJECTIVE.
Summary of planned activities (24 Hours)
CONTINUE DRILLING 8 1/2" HOLE.



In [57]:
import pdfplumber
import cv2
import numpy as np
import re

def extract_sections_with_cv(pdf_file):
    """Find gray header bars on every page and return the text inside them.

    Each page is rendered with ``resolution=150``, pixels inside a fixed HSV
    range (low saturation, mid brightness) are masked, and the bounding box of
    every external contour of that mask is inspected. Boxes wider than 200 px
    and between 15 and 40 px tall are cropped out of the PDF page; their text
    is kept when it passes the quality filters below.

    Args:
        pdf_file: Path of the PDF to scan, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts ordered by ``(page, y_position)``, one per distinct
        text, with the keys:
            page: 1-based page number.
            text: stripped text found inside the box.
            y_position: top edge of the box, i.e. the image row scaled by
                ``page.width / image width``.
            height: height of the box in image pixels (not scaled).
        When the same text is found more than once, the entry that comes
        first in that ordering is the one that is kept.
    """
    # HSV bounds used to pick out the gray header background.
    lower_gray = np.array([0, 0, 100])
    upper_gray = np.array([180, 50, 220])

    sections = []

    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):

            # Render the page and convert PIL (RGB) -> OpenCV (BGR) -> HSV.
            pil_img = page.to_image(resolution=150).original
            opencv_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
            hsv = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2HSV)

            mask = cv2.inRange(hsv, lower_gray, upper_gray)
            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Pixel -> page-unit factor; constant for the whole page, so it is
            # computed once instead of once per contour.
            scale = page.width / opencv_img.shape[1]

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Only wide, fairly flat boxes are treated as header bars.
                if not (w > 200 and 15 < h < 40):
                    continue

                pdf_x0 = x * scale
                pdf_y0 = y * scale
                pdf_x1 = (x + w) * scale
                pdf_y1 = (y + h) * scale

                # Best effort: a box that cannot be cropped or read is skipped.
                # Exception (rather than a bare except) keeps Ctrl-C working.
                try:
                    text = page.crop((pdf_x0, pdf_y0, pdf_x1, pdf_y1)).extract_text()
                except Exception:
                    continue

                if not text or len(text.strip()) <= 3:
                    continue
                text = text.strip()

                # QUALITY FILTERS
                # 1. Doubled letters such as "TTiimmee".
                if has_duplicate_chars(text):
                    continue

                # 2. Real headers are concise.
                if len(text) > 100:
                    continue

                # 3. Too many digits / punctuation characters.
                if count_special_chars(text) > len(text) * 0.3:
                    continue

                sections.append({
                    'page': page_num,
                    'text': text,
                    'y_position': pdf_y0,
                    'height': h
                })

    # Order top-to-bottom first so that "first occurrence" below means the
    # first one in reading order, not the first contour OpenCV happened to
    # return.
    sections.sort(key=lambda s: (s['page'], s['y_position']))

    # Remove duplicates (keep first occurrence); the result stays sorted.
    seen = set()
    unique_sections = []
    for s in sections:
        if s['text'] not in seen:
            seen.add(s['text'])
            unique_sections.append(s)

    return unique_sections

def has_duplicate_chars(text):
    """Tell whether ``text`` is dominated by doubled letters (e.g. "TTiimmee").

    Counts the positions where a letter is immediately followed by the same
    letter and returns True when that count exceeds 20% of ``len(text)``.
    """
    doubled = sum(
        1
        for current, following in zip(text, text[1:])
        if current == following and current.isalpha()
    )
    threshold = len(text) * 0.2
    return doubled > threshold

def count_special_chars(text):
    """Return how many characters of ``text`` are neither letters nor whitespace."""
    special = [ch for ch in text if not (ch.isalpha() or ch.isspace())]
    return len(special)

# Try the detector on one sample report and list what it found.
pdf_file = '../data/PDF_version_1000/15_9_19_A_1997_07_30.pdf'
sections = extract_sections_with_cv(pdf_file)

# One line per detected header: page number, y_position (one decimal), text.
print(f"Found {len(sections)} valid sections:\n")
for s in sections:
    print(f"Page {s['page']}, Y={s['y_position']:.1f}: {s['text']}")

Found 8 valid sections:

Page 1, Y=244.2: Summary of activities (24 Hours)
Page 1, Y=296.1: Summary of planned activities (24 Hours)
Page 1, Y=333.0: Operations
Page 1, Y=506.2: Drilling Fluid
Page 2, Y=83.0: Pore Pressure
Page 2, Y=141.1: Survey Station
Page 2, Y=224.1: Lithology Information
Page 2, Y=282.6: Gas Reading Information


In [54]:
# Sample report read by the camelot cell that uses `path`.
path = "../data/PDF_version_1000/15_9_19_A_1997_07_30.pdf"

In [55]:
import camelot

# Ask camelot for the tables on every page of the file at `path` (flavor='lattice').
tables = camelot.read_pdf(path, pages='all', flavor='lattice')

# Write each table to table_<index>.csv; the path is relative, so the files
# land in the notebook's current working directory.
for i, table in enumerate(tables):
    table.to_csv(f'table_{i}.csv')

In [59]:
import pdfplumber
import cv2
import numpy as np
import camelot
import re

def extract_sections_with_cv(pdf_file):
    """Find gray header bars on every page and return their text and position.

    Each page is rendered with ``resolution=150``, pixels inside a fixed HSV
    range (low saturation, mid brightness) are masked, and the bounding box of
    every external contour of that mask is inspected. Boxes wider than 200 px
    and between 15 and 40 px tall are cropped out of the PDF page; their text
    is kept unless it is dominated by doubled letters or longer than 100
    characters.

    Args:
        pdf_file: Path of the PDF to scan, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts ordered by ``(page, y_position)``, one per distinct
        text, with the keys:
            page: 1-based page number.
            text: stripped text found inside the box.
            y_position: top edge of the box, i.e. the image row scaled by
                ``page.width / image width``.
            page_height: ``page.height`` of the page the box was found on.
        When the same text is found more than once, the entry that comes
        first in that ordering is the one that is kept.
    """
    # HSV bounds used to pick out the gray header background.
    lower_gray = np.array([0, 0, 100])
    upper_gray = np.array([180, 50, 220])

    sections = []

    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):

            # Render the page and convert PIL (RGB) -> OpenCV (BGR) -> HSV.
            pil_img = page.to_image(resolution=150).original
            opencv_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
            hsv = cv2.cvtColor(opencv_img, cv2.COLOR_BGR2HSV)

            mask = cv2.inRange(hsv, lower_gray, upper_gray)
            contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Pixel -> page-unit factor; constant for the whole page, so it is
            # computed once instead of once per contour.
            scale = page.width / opencv_img.shape[1]

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Only wide, fairly flat boxes are treated as header bars.
                if not (w > 200 and 15 < h < 40):
                    continue

                pdf_y = y * scale

                # Best effort: a box that cannot be cropped or read is skipped.
                # Exception (rather than a bare except) keeps Ctrl-C working.
                try:
                    cropped = page.crop((x * scale, pdf_y, (x + w) * scale, (y + h) * scale))
                    text = cropped.extract_text()
                except Exception:
                    continue

                if not text or len(text.strip()) <= 3:
                    continue
                text = text.strip()

                if has_duplicate_chars(text) or len(text) > 100:
                    continue

                sections.append({
                    'page': page_num,
                    'text': text,
                    'y_position': pdf_y,
                    'page_height': page.height
                })

    # Order top-to-bottom first so that de-duplication keeps the first
    # occurrence in reading order, not the first contour OpenCV returned.
    sections.sort(key=lambda s: (s['page'], s['y_position']))

    seen = set()
    unique_sections = []
    for s in sections:
        if s['text'] not in seen:
            seen.add(s['text'])
            unique_sections.append(s)

    return unique_sections

def has_duplicate_chars(text):
    """Return True when doubled letters make up more than 20% of ``len(text)``.

    A "doubled letter" is an alphabetic character immediately followed by an
    identical character, as in "TTiimmee".
    """
    pairs = zip(text, text[1:])
    repeats = len([left for left, right in pairs if left.isalpha() and left == right])
    return repeats > 0.2 * len(text)

def _section_csv_stem(section_text, fallback):
    """Turn a section heading into a lowercase, underscore-separated file stem."""
    name = re.sub(r'\([^)]*\)', '', section_text)  # drop parenthesised parts
    name = name.lower().strip()
    name = re.sub(r'[^\w\s-]', '', name)           # keep word chars, spaces, hyphens
    name = re.sub(r'[-\s]+', '_', name)
    # A heading made only of punctuation would otherwise produce ".csv".
    return name or fallback


def _read_region_tables(pdf_file, page_num, region):
    """Run camelot's lattice parser on one page, restricted to ``region``."""
    return camelot.read_pdf(
        pdf_file,
        pages=str(page_num),
        flavor='lattice',
        table_regions=[region]
    )


def extract_tables_by_sections(pdf_file):
    """Extract tables region by region, using detected section headers as cuts.

    The headers come from ``extract_sections_with_cv``. Two passes are made:

    1. If the first header is on page 1, everything above it is parsed and
       every table found is written to ``common_<n>.csv``.
    2. For each header, the band between it and the next header on the same
       page (or the bottom of the page) is parsed and the first table found is
       written to ``<normalised header text>.csv``.

    Header positions are measured from the top of the page, while the region
    strings handed to camelot are built as ``x1,y1,x2,y2`` with y computed as
    ``page_height - position``, so y1 is the upper edge and y2 the lower edge.

    Progress is printed; files are written to the current working directory.

    Args:
        pdf_file: Path of the PDF to process.

    Returns:
        None.
    """
    sections = extract_sections_with_cv(pdf_file)

    print(f"Found {len(sections)} sections\n")

    # Width/height of every page, keyed by 1-based page number, so each region
    # is built from the dimensions of the page it lies on (and an empty PDF
    # does not raise on pages[0]).
    with pdfplumber.open(pdf_file) as pdf:
        page_sizes = {num: (page.width, page.height) for num, page in enumerate(pdf.pages, 1)}

    # Extract common tables (from top of page 1 to first section)
    if sections and sections[0]['page'] == 1:
        first_section = sections[0]
        page_width, page_height = page_sizes[1]

        print("=" * 70)
        print("EXTRACTING COMMON TABLES (before first section)")
        print("=" * 70)

        y1 = page_height                                     # top of page
        y2 = page_height - first_section['y_position'] + 10  # 10 units above first header
        x1 = 0
        x2 = page_width

        region = f'{x1},{y1},{x2},{y2}'
        print("Page 1: Common Report Info")
        print(f"  Region: {region}")

        if y1 <= y2:
            # Header sits at the very top of the page: nothing above it.
            print("  ✗ Skipped: region has no height")
        else:
            try:
                tables = _read_region_tables(pdf_file, 1, region)

                # Save all common tables
                for idx, table in enumerate(tables):
                    filename = f'common_{idx + 1}.csv'
                    table.to_csv(filename)
                    print(f"  ✓ Saved: {filename} ({table.df.shape})")

                if len(tables) == 0:
                    print("  ✗ No table found")

            except Exception as e:
                print(f"  ✗ Error: {e}")

        print()

    # Extract tables for each section
    print("=" * 70)
    print("EXTRACTING SECTION TABLES")
    print("=" * 70)
    print()

    used_stems = set()  # file stems already written during this call

    for i, section in enumerate(sections):
        page_num = section['page']
        page_width, page_height = page_sizes[page_num]

        # Section Y position (from top)
        section_y_top = section['y_position']

        # Find next section on same page or use page bottom
        if i + 1 < len(sections) and sections[i + 1]['page'] == page_num:
            next_y = sections[i + 1]['y_position']
        else:
            next_y = page_height

        # Flip from top-origin positions to the bottom-origin values used in
        # the region string.
        y1 = page_height - section_y_top - 20  # start below section header
        y2 = page_height - next_y + 10         # end just above next section

        # Full width
        x1 = 0
        x2 = page_width

        region = f'{x1},{y1},{x2},{y2}'

        print(f"Page {page_num}: {section['text']}")
        print(f"  Region: {region}")

        if y1 <= y2:
            # Headers closer together than the 30-unit margins leave no band
            # to search; do not hand an inverted box to camelot.
            print("  ✗ Skipped: region has no height")
            print()
            continue

        try:
            tables = _read_region_tables(pdf_file, page_num, region)

            if len(tables) > 0:
                stem = _section_csv_stem(section['text'], fallback=f'section_{i + 1}')

                # Two headings can normalise to the same stem; add a numeric
                # suffix instead of overwriting the earlier file.
                candidate = stem
                suffix = 2
                while candidate in used_stems:
                    candidate = f'{stem}_{suffix}'
                    suffix += 1
                used_stems.add(candidate)

                filename = f'{candidate}.csv'
                tables[0].to_csv(filename)
                print(f"  ✓ Saved: {filename} ({tables[0].df.shape})")
            else:
                print("  ✗ No table found")
        except Exception as e:
            print(f"  ✗ Error: {e}")

        print()

# Run the section-based extraction on the sample report.
# Note: this rebinds the notebook-level name `pdf_file`.
pdf_file = '../data/PDF_version_1000/15_9_19_A_1997_07_30.pdf'
extract_tables_by_sections(pdf_file)

Found 8 sections

EXTRACTING COMMON TABLES (before first section)
Page 1: Common Report Info
  Region: 0,842,595,607.7620967741935
  ✓ Saved: common_1.csv ((7, 3))
  ✓ Saved: common_2.csv ((10, 3))
  ✓ Saved: common_3.csv ((17, 3))

EXTRACTING SECTION TABLES

Page 1: Summary of activities (24 Hours)
  Region: 0,577.7620967741935,595,555.9395161290322
  ✗ No table found

Page 1: Summary of planned activities (24 Hours)
  Region: 0,525.9395161290322,595,518.991935483871
  ✗ No table found

Page 1: Operations
  Region: 0,488.991935483871,595,345.77016129032256
  ✓ Saved: operations.csv ((13, 6))

Page 1: Drilling Fluid
  Region: 0,315.77016129032256,595,10
  ✓ Saved: drilling_fluid.csv ((29, 3))

Page 2: Pore Pressure
  Region: 0,738.9879032258065,595,710.9274193548387
  ✗ No table found

Page 2: Survey Station
  Region: 0,680.9274193548387,595,627.9153225806451
  ✗ No table found

Page 2: Lithology Information
  Region: 0,597.9153225806451,595,569.375
  ✗ No table found

Page 2: Gas Read

In [75]:
import pandas as pd
import json
import numpy as np

def _normalize_key(raw_key):
    """Turn a label such as "Water depth MSL (m): " into "water_depth_msl_m"."""
    # Strip surrounding whitespace BEFORE removing the trailing colon,
    # otherwise "Status: " would keep its colon.
    key = raw_key.strip().rstrip(':').strip()
    # Any run of whitespace (spaces, tabs, line breaks inside a cell) becomes
    # a single underscore.
    key = '_'.join(key.lower().split())
    return key.replace('(', '').replace(')', '').replace('/', '_')


def csv_to_json(csv_file="common_3.csv"):
    """Load a CSV and return it as JSON-friendly Python data.

    The layout is guessed from the header row that ``pd.read_csv`` produces:

    * If any column name contains "unnamed" (case-insensitive), the file is
      treated as a key/value listing: the first column holds the label and the
      value is the first non-null cell to its right. Rows whose first cell is
      not a string are skipped. A later row with the same normalised label
      overwrites an earlier one.
    * Otherwise the file is treated as a regular table and returned row by
      row, with missing cells replaced by ``None``.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        dict mapping normalised labels to values (``None`` when the row has
        no value) for key/value files, or a list of ``{column: value}`` dicts
        for tabular files.
    """
    df = pd.read_csv(csv_file)

    # Check if columns are unnamed (Unnamed: 0, Unnamed: 1, etc.)
    has_unnamed = any('unnamed' in str(col).lower() for col in df.columns)

    if not has_unnamed:
        # Tabular structure - keep as is, but use None instead of NaN so the
        # result serialises to JSON null.
        df = df.replace({np.nan: None})
        return df.to_dict(orient='records')

    # Key-value structure
    result = {}
    for _, row in df.iterrows():
        key = row.iloc[0]
        if not isinstance(key, str):
            continue  # empty first cell (NaN) or a non-text label

        # First non-null cell after the label, or None when there is none.
        value = next((cell for cell in row.iloc[1:] if pd.notna(cell)), None)
        result[_normalize_key(key)] = value

    return result

# Test with key-value CSV
# Reads common_3.csv, which is expected to exist in the working directory
# (the section-extraction cell writes common_<n>.csv files there).
print("Key-Value (Unnamed columns):")
data1 = csv_to_json("common_3.csv")
print(json.dumps(data1, indent=2))

print("\n" + "="*70 + "\n")

# Test with tabular CSV
# Reads operations.csv from the working directory as well.
print("Tabular (Named columns):")
data2 = csv_to_json("operations.csv")
print(json.dumps(data2[:2], indent=2))  # Show first 2 rows

Key-Value (Unnamed columns):
{
  "status": "normal",
  "report_creation_time": "2018-05-03 13:53",
  "report_number": "6",
  "days_ahead_behind_+_-": null,
  "operator": "Statoil",
  "rig_name": "BYFORD DOLPHIN",
  "drilling_contractor": null,
  "spud_date": "1997-07-25 00:00",
  "wellbore_type": null,
  "elevation_rkb-msl_m": "25",
  "water_depth_msl_m": "84",
  "tight_well": "Y",
  "hpht": "Y",
  "temperature_": null,
  "pressure_": null,
  "date_well_complete": "1997-08-30"
}


Tabular (Named columns):
[
  {
    "Start\ntime": "00:00",
    "End\ntime": "01:00",
    "End Depth\nmMD": 2202,
    "Main - Sub Activity": "interruption -- sidetrack",
    "State": "ok",
    "Remark": "FINISHED DRILLING TO 2202 M - TOTAL OF 19 M OF FORMATION DRILLED BEYOND WINDOW'S BOTTOM. PU & SO THROUGH WINDOW 5 TIMES - N\nO PROBLEMS."
  },
  {
    "Start\ntime": "01:00",
    "End\ntime": "02:30",
    "End Depth\nmMD": 2175,
    "Main - Sub Activity": "drilling -- circulating cond\nitioning",
    "State": 