In [1]:
import pdfplumber
import pandas as pd
import numpy as np

# Pull the table from the 7th page of the paper (pdf.pages is 0-indexed).
with pdfplumber.open("../DATA/Zechlau2022.pdf") as pdf:
    page = pdf.pages[6]
    zechtable = page.extract_table()

# First cell of the first extracted row, with line breaks turned into spaces.
title=" ".join([x.split("\n") for x in zechtable[0]][0])
rawcolumns=zechtable[1][0].split(" ")
# Hand-written column names used for the reshaped frames below.
columns=["Model", "γ_LT", "σ_LT", 'γ_IAV', "σ_IAV"]
# The first cell of the third row is tokenised on whitespace; the code below
# relies on the "CMIP6" and "OBS" tokens to delimit the two model groups.
zdata=zechtable[2:][0][0].split()
ncol=len(columns)

# Tokens between the leading group label (index 0) and "CMIP6", minus the '±' separators.
cmip5_list = [item for item in zdata[1:zdata.index("CMIP6")] if item != '±']
L=len(cmip5_list)

# Integer division: reshape raises if the token count is not a multiple of ncol.
nrows=L // ncol
# NOTE: cmip5_gammas keeps its values as strings; only the CMIP6 frame is
# converted to numbers below.
cmip5_gammas=pd.DataFrame(np.array(cmip5_list).reshape((nrows,ncol)),columns=columns)

# Tokens between "CMIP6" and "OBS", minus the '±' separators.
cmip6_list = [item for item in zdata[zdata.index("CMIP6")+1:zdata.index("OBS")] if item != '±']
L6=len(cmip6_list)
rows6=L6 // ncol
cmip6_strings=np.array(cmip6_list).reshape((rows6,ncol))
cmip6_gammas=pd.DataFrame(cmip6_strings,columns=columns)


# now need to do some more data cleaning :(
mystr=cmip6_gammas["γ_LT"].values[0]
# The minus sign is not extracted from the PDF as an ASCII '-'. The first
# character of the first γ_LT value is taken as the offending glyph, but only
# when it is not itself a legitimate part of a number; otherwise a positive
# first value would have its leading digit replaced by '-'.
badval=mystr[0]
minus_like = {"\u2212": "-", "\u2013": "-", "\u2014": "-"}
if not (badval.isdigit() or badval in "+-."):
    minus_like[badval] = "-"
minus_table = str.maketrans(minus_like)

# Normalise the minus glyphs, then convert. Anything still unparseable raises
# ValueError instead of being hidden behind a bare except.
for c in columns[1:]:
    cmip6_gammas[c] = pd.to_numeric(cmip6_gammas[c].str.translate(minus_table))

#from pdf2image import convert_from_path
#images = convert_from_path("DATA/Zechlau2022.pdf", first_page=7, last_page=7)
#images[0].save("temp_page.jpg", "JPEG")

In [2]:
# Show the CMIP6 frame built from the pdfplumber extraction above.
cmip6_gammas

Unnamed: 0,Model,γ_LT,σ_LT,γ_IAV,σ_IAV
0,ACCESS-ESM1-5,-30.01,5.7,-3.23,0.57
1,CanESM5,-56.38,4.4,-7.1,0.94
2,CESM2,-26.86,3.6,-2.89,0.39
3,CNRM-ESM2-1,-72.87,5.7,-6.07,0.99
4,GFDL-ESM4,-93.55,6.1,-8.22,0.87
5,NorESM2-LM,-35.44,3.0,-2.64,0.39
6,UKESM1-0-LL,-47.93,3.6,-5.13,0.62


In [13]:
import pandas as pd
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import AutoTableFormatter, AutoTableDetector

# --- CONFIGURATION ---
PDF_PATH = "../DATA/Zechlau2022.pdf"
PAGE_NUM = 7  # 1-based page number; converted to a 0-based index below

# --- 1. LOAD PDF ---
doc = PyPDFium2Document(PDF_PATH)
# try/finally guarantees the document handle is released even when model
# loading, detection or formatting raises.
try:
    # Note: PyPDFium uses 0-based indexing, so Page 7 is index 6
    page = doc.get_page(PAGE_NUM - 1)

    # --- 2. INITIALIZE MODELS ---
    print("Initializing Table Detector and Formatter...")
    detector = AutoTableDetector()
    formatter = AutoTableFormatter()

    # --- 3. DETECT TABLES ---
    # This finds the coordinates (bounding boxes) of tables on the page
    detected_tables = detector.extract(page)

    if detected_tables:
        print(f"Found {len(detected_tables)} table(s). Processing the first one...")

        # --- 4. FORMAT / EXTRACT DATA ---
        # Take the first detected table and pass it to the formatter,
        # which converts it to a DataFrame.
        first_table = detected_tables[0]
        formatted_table = formatter.extract(first_table)

        df = formatted_table.df()

        print("\n--- EXTRACTED DATAFRAME ---")
        print(df.to_string())

        # Optional: Save to CSV
        # df.to_csv("extracted_table.csv", index=False)

    else:
        print("No tables were detected on this page.")
finally:
    doc.close()

Initializing Table Detector and Formatter...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

Found 1 table(s). Processing the first one...

--- EXTRACTED DATAFRAME ---
   Ensemble                     Model    γLT (GtC/K) γIAV (GtC/yr/K)
0     CMIP5                   CanESM2   −74.21 ± 7.2    −9.87 ± 0.86
1      None                 CESM1-BGC    −7.27 ± 7.0    −1.37 ± 1.16
2      None                GFDL-ESM2M  −134.95 ± 9.3   −16.96 ± 2.06
3      None                 HadGEM-ES   −64.60 ± 6.4    −7.77 ± 0.72
4      None               IPSL-CM5A-R   −36.61 ± 4.7    −6.72 ± 1.14
5      None                NorESM1-ME    −7.47 ± 4.3    −3.34 ± 1.14
6     CMIP6             ACCESS-ESM1-5   −30.01 ± 5.7    −3.23 ± 0.57
7      None                   CanESM5   −56.38 ± 4.4    −7.10 ± 0.94
8      None                     CESM2   −26.86 ± 3.6    −2.89 ± 0.39
9      None               CNRM-ESM2-1   −72.87 ± 5.7    −6.07 ± 0.99
10     None                 GFDL-ESM4   −93.55 ± 6.1    −8.22 ± 0.87
11     None                NorESM2-LM   −35.44 ± 3.0    −2.64 ± 0.39
12     None               UK

In [22]:
import pandas as pd
import re
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import AutoTableFormatter, AutoTableDetector

# --- 1. GENERIC CLEANING FUNCTION ---
def auto_clean_scientific_table(df):
    """
    Return a cleaned copy of a table extracted from a PDF.

    Steps:
      1. If the first column has missing or empty-string cells, blank cells
         are turned into NA and the column is forward-filled (group labels
         such as an ensemble name are usually printed only once).
      2. Every column in which at least one cell contains '±' is split on
         '±' into two numeric columns named "<col>_value" and "<col>_error".
         Minus, en-dash and em-dash characters are replaced with an ASCII
         hyphen first so negative numbers parse. Cells that cannot be
         parsed become NaN. The original column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw extracted table. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame. The generated columns are appended at the end, in the
        order the source columns were processed.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # forward-filled labels and the first pair of new columns into the
    # caller's frame.
    df = df.copy()

    # A frame without columns has nothing to clean (iloc[:, 0] would raise).
    if df.shape[1] == 0:
        return df

    # A. Fix "grouped" first column where values are implied by empty cells
    first_col = df.iloc[:, 0]
    if first_col.isnull().any() or (first_col == '').any():
        df.iloc[:, 0] = first_col.replace(r'^\s*$', pd.NA, regex=True).ffill()

    # df.columns is evaluated once here, so the columns added inside the loop
    # are not themselves revisited.
    for col in df.columns:
        # Compare as strings so None / numeric cells do not break .str access
        col_data = df[col].astype(str)

        # B. DETECT: skip columns without any '±'
        if not col_data.str.contains('±').any():
            continue
        print(f"-> Detected error bars in column: '{col}'. Processing...")

        # C. CLEAN: replace Minus Sign, En Dash and Em Dash with a hyphen
        clean_series = col_data.str.replace(r'[−–—]', '-', regex=True)

        # D. SPLIT: part 0 is the value, part 1 the error
        split_data = clean_series.str.split('±', expand=True)

        # E/F. ASSIGN & CONVERT: unparseable text becomes NaN
        df[f"{col}_value"] = pd.to_numeric(split_data[0], errors='coerce')
        df[f"{col}_error"] = pd.to_numeric(split_data[1], errors='coerce')

        # Drop the original combined column
        df = df.drop(columns=[col])

    return df

# --- 2. EXTRACTION PIPELINE (Standard) ---
PDF_PATH = "../DATA/Zechlau2022.pdf"
PAGE_NUM = 7  # 1-based page number; converted to a 0-based index below

doc = PyPDFium2Document(PDF_PATH)
# The document was previously never closed in this cell; try/finally releases
# the handle whether or not extraction succeeds.
try:
    page = doc.get_page(PAGE_NUM - 1)

    detector = AutoTableDetector()
    formatter = AutoTableFormatter()

    detected_tables = detector.extract(page)

    if detected_tables:
        # Get raw dataframe from the first detected table
        raw_df = formatter.extract(detected_tables[0]).df()

        # --- 3. APPLY SMART CLEANING ---
        final_df = auto_clean_scientific_table(raw_df)

        # Filter for CMIP6 (Generic filter based on first column)
        # We look for rows where the first column string contains "CMIP6"
        #final_df = final_df[final_df.iloc[:, 0].astype(str).str.contains("CMIP6", na=False)]

        print("\n--- PROCESSED DYNAMIC DATAFRAME ---")
        print(final_df.to_string())

    else:
        print("No tables found.")
finally:
    doc.close()

-> Detected error bars in column: 'γLT (GtC/K)'. Processing...
-> Detected error bars in column: 'γIAV (GtC/yr/K)'. Processing...

--- PROCESSED DYNAMIC DATAFRAME ---
   Ensemble                     Model  γLT (GtC/K)_value  γLT (GtC/K)_error  γIAV (GtC/yr/K)_value  γIAV (GtC/yr/K)_error
0     CMIP5                   CanESM2             -74.21                7.2                  -9.87                   0.86
1     CMIP5                 CESM1-BGC              -7.27                7.0                  -1.37                   1.16
2     CMIP5                GFDL-ESM2M            -134.95                9.3                 -16.96                   2.06
3     CMIP5                 HadGEM-ES             -64.60                6.4                  -7.77                   0.72
4     CMIP5               IPSL-CM5A-R             -36.61                4.7                  -6.72                   1.14
5     CMIP5                NorESM1-ME              -7.47                4.3                  -3.34   

In [23]:
# Rich display of the cleaned frame produced by auto_clean_scientific_table.
final_df

Unnamed: 0,Ensemble,Model,γLT (GtC/K)_value,γLT (GtC/K)_error,γIAV (GtC/yr/K)_value,γIAV (GtC/yr/K)_error
0,CMIP5,CanESM2,-74.21,7.2,-9.87,0.86
1,CMIP5,CESM1-BGC,-7.27,7.0,-1.37,1.16
2,CMIP5,GFDL-ESM2M,-134.95,9.3,-16.96,2.06
3,CMIP5,HadGEM-ES,-64.6,6.4,-7.77,0.72
4,CMIP5,IPSL-CM5A-R,-36.61,4.7,-6.72,1.14
5,CMIP5,NorESM1-ME,-7.47,4.3,-3.34,1.14
6,CMIP6,ACCESS-ESM1-5,-30.01,5.7,-3.23,0.57
7,CMIP6,CanESM5,-56.38,4.4,-7.1,0.94
8,CMIP6,CESM2,-26.86,3.6,-2.89,0.39
9,CMIP6,CNRM-ESM2-1,-72.87,5.7,-6.07,0.99


In [19]:
# Show the CMIP6 frame built in the pdfplumber cell (cmip6_gammas).
cmip6_gammas

Unnamed: 0,Model,γ_LT,σ_LT,γ_IAV,σ_IAV
0,ACCESS-ESM1-5,-30.01,5.7,-3.23,0.57
1,CanESM5,-56.38,4.4,-7.1,0.94
2,CESM2,-26.86,3.6,-2.89,0.39
3,CNRM-ESM2-1,-72.87,5.7,-6.07,0.99
4,GFDL-ESM4,-93.55,6.1,-8.22,0.87
5,NorESM2-LM,-35.44,3.0,-2.64,0.39
6,UKESM1-0-LL,-47.93,3.6,-5.13,0.62


In [24]:
import pandas as pd
import re
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import AutoTableFormatter, AutoTableDetector

# --- CONFIGURATION ---
# Relative path to the source PDF.
PDF_PATH = "../DATA/Zechlau2022.pdf"
# 1-based page number; the pipeline below subtracts 1 when fetching the page.
PAGE_NUM = 7 

# --- 1. DYNAMIC CLEANING FUNCTION ---
def auto_clean_scientific_table(df):
    """
    Return a cleaned copy of a table extracted from a PDF.

    1. Fills missing 'Group' labels (like CMIP5/CMIP6) in the first column:
       blank cells become NA and the column is forward-filled.
    2. Detects columns in which at least one cell contains '±'.
    3. Splits them into numeric "<col>_value" and "<col>_error" columns
       (unparseable cells become NaN) and drops the original column.
    4. Fixes PDF-specific dash issues by replacing minus, en-dash and
       em-dash characters with an ASCII hyphen before parsing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw extracted table. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame. The generated columns are appended at the end, in the
        order the source columns were processed.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # forward-filled labels and the first pair of new columns into the
    # caller's frame.
    df = df.copy()

    # A frame without columns has nothing to clean (iloc[:, 0] would raise).
    if df.shape[1] == 0:
        return df

    # A. Fix "Grouped" columns (Ensemble)
    # Scientific tables often write the group name only once (e.g., "CMIP5")
    # and leave the rows below it empty, so forward-fill them.
    first_col = df.iloc[:, 0]
    if first_col.isnull().any() or (first_col == '').any():
        df.iloc[:, 0] = first_col.replace(r'^\s*$', pd.NA, regex=True).ffill()

    # df.columns is evaluated once here, so the columns added inside the loop
    # are not themselves revisited.
    for col in df.columns:
        # Compare as strings so None / numeric cells do not break .str access
        col_data = df[col].astype(str)

        # B. DETECT: skip columns without any '±'
        if not col_data.str.contains('±').any():
            continue
        print(f"-> Detected error bars in column: '{col}'. Processing...")

        # C. CLEAN: Replace ALL dash types (En Dash, Em Dash, Minus) with Hyphen
        clean_series = col_data.str.replace(r'[−–—]', '-', regex=True)

        # D. SPLIT: part 0 is the value, part 1 the error
        split_data = clean_series.str.split('±', expand=True)

        # E/F. ASSIGN: force to numeric, turn text (like headings) into NaN
        df[f"{col}_value"] = pd.to_numeric(split_data[0], errors='coerce')
        df[f"{col}_error"] = pd.to_numeric(split_data[1], errors='coerce')

        # Drop the original "dirty" column
        df = df.drop(columns=[col])

    return df

# --- 2. EXECUTION PIPELINE ---

# A. Load PDF
doc = PyPDFium2Document(PDF_PATH)
# try/finally guarantees the document handle is released even when model
# loading, detection, formatting or cleaning raises.
try:
    page = doc.get_page(PAGE_NUM - 1) # 0-indexed

    # B. Initialize AI Models
    print("Initializing Table Detector...")
    detector = AutoTableDetector()
    formatter = AutoTableFormatter()

    # C. Detect Tables
    detected_tables = detector.extract(page)

    if detected_tables:
        print(f"Found {len(detected_tables)} table(s).")

        # D. Extract Raw Data
        # We take the first table found on the page
        raw_df = formatter.extract(detected_tables[0]).df()

        # E. Apply Logic
        final_df = auto_clean_scientific_table(raw_df)

        print("\n--- FINAL CLEANED DATAFRAME ---")
        print(final_df.to_string())

        # Optional: Save to CSV to verify in Excel
        # final_df.to_csv("cmip_data_clean.csv", index=False)

    else:
        print("No tables detected on this page.")
finally:
    doc.close()

Initializing Table Detector...
Found 1 table(s).
-> Detected error bars in column: 'γLT (GtC/K)'. Processing...
-> Detected error bars in column: 'γIAV (GtC/yr/K)'. Processing...

--- FINAL CLEANED DATAFRAME ---
   Ensemble                     Model  γLT (GtC/K)_value  γLT (GtC/K)_error  γIAV (GtC/yr/K)_value  γIAV (GtC/yr/K)_error
0     CMIP5                   CanESM2             -74.21                7.2                  -9.87                   0.86
1     CMIP5                 CESM1-BGC              -7.27                7.0                  -1.37                   1.16
2     CMIP5                GFDL-ESM2M            -134.95                9.3                 -16.96                   2.06
3     CMIP5                 HadGEM-ES             -64.60                6.4                  -7.77                   0.72
4     CMIP5               IPSL-CM5A-R             -36.61                4.7                  -6.72                   1.14
5     CMIP5                NorESM1-ME              -7.47

In [25]:
import pandas as pd
import re
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import AutoTableFormatter, AutoTableDetector

# --- CONFIGURATION ---
# Relative path to the source PDF.
PDF_PATH = "../DATA/Zechlau2022.pdf"
# 1-based page number; the pipeline below subtracts 1 when fetching the page.
PAGE_NUM = 10

# --- 1. DYNAMIC CLEANING FUNCTION ---
def auto_clean_scientific_table(df):
    """
    Return a cleaned copy of a table extracted from a PDF.

    1. Fills missing 'Group' labels (like CMIP5/CMIP6) in the first column:
       blank cells become NA and the column is forward-filled.
    2. Detects columns in which at least one cell contains '±'.
    3. Splits them into numeric "<col>_value" and "<col>_error" columns
       (unparseable cells become NaN) and drops the original column.
    4. Fixes PDF-specific dash issues by replacing minus, en-dash and
       em-dash characters with an ASCII hyphen before parsing.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw extracted table. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame. The generated columns are appended at the end, in the
        order the source columns were processed.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # forward-filled labels and the first pair of new columns into the
    # caller's frame.
    df = df.copy()

    # A frame without columns has nothing to clean (iloc[:, 0] would raise).
    if df.shape[1] == 0:
        return df

    # A. Fix "Grouped" columns (Ensemble)
    # Scientific tables often write the group name only once (e.g., "CMIP5")
    # and leave the rows below it empty, so forward-fill them.
    first_col = df.iloc[:, 0]
    if first_col.isnull().any() or (first_col == '').any():
        df.iloc[:, 0] = first_col.replace(r'^\s*$', pd.NA, regex=True).ffill()

    # df.columns is evaluated once here, so the columns added inside the loop
    # are not themselves revisited.
    for col in df.columns:
        # Compare as strings so None / numeric cells do not break .str access
        col_data = df[col].astype(str)

        # B. DETECT: skip columns without any '±'
        if not col_data.str.contains('±').any():
            continue
        print(f"-> Detected error bars in column: '{col}'. Processing...")

        # C. CLEAN: Replace ALL dash types (En Dash, Em Dash, Minus) with Hyphen
        clean_series = col_data.str.replace(r'[−–—]', '-', regex=True)

        # D. SPLIT: part 0 is the value, part 1 the error
        split_data = clean_series.str.split('±', expand=True)

        # E/F. ASSIGN: force to numeric, turn text (like headings) into NaN
        df[f"{col}_value"] = pd.to_numeric(split_data[0], errors='coerce')
        df[f"{col}_error"] = pd.to_numeric(split_data[1], errors='coerce')

        # Drop the original "dirty" column
        df = df.drop(columns=[col])

    return df

# --- 2. EXECUTION PIPELINE ---

# A. Load PDF
doc = PyPDFium2Document(PDF_PATH)
# try/finally guarantees the document handle is released even when model
# loading, detection, formatting or cleaning raises.
try:
    page = doc.get_page(PAGE_NUM - 1) # 0-indexed

    # B. Initialize AI Models
    print("Initializing Table Detector...")
    detector = AutoTableDetector()
    formatter = AutoTableFormatter()

    # C. Detect Tables
    detected_tables = detector.extract(page)

    if detected_tables:
        print(f"Found {len(detected_tables)} table(s).")

        # D. Extract Raw Data
        # We take the first table found on the page
        raw_df = formatter.extract(detected_tables[0]).df()

        # E. Apply Logic
        final_df = auto_clean_scientific_table(raw_df)

        print("\n--- FINAL CLEANED DATAFRAME ---")
        print(final_df.to_string())

        # Optional: Save to CSV to verify in Excel
        # final_df.to_csv("cmip_data_clean.csv", index=False)

    else:
        print("No tables detected on this page.")
finally:
    doc.close()

Initializing Table Detector...
Found 1 table(s).
-> Detected error bars in column: 'GPP(2 × CO2)/ GPP(1 × CO2)'. Processing...
-> Detected error bars in column: 'Offset to initial amplitude, a0 (ppmv)'. Processing...
-> Detected error bars in column: 'CO2 sensitivity of amplitude, a (ppmv/ppmv)'. Processing...

--- FINAL CLEANED DATAFRAME ---
   Ensemble               Model  GPP(2 × CO2)/ GPP(1 × CO2)_value  GPP(2 × CO2)/ GPP(1 × CO2)_error  Offset to initial amplitude, a0 (ppmv)_value  Offset to initial amplitude, a0 (ppmv)_error  CO2 sensitivity of amplitude, a (ppmv/ppmv)_value  CO2 sensitivity of amplitude, a (ppmv/ppmv)_error
0     CMIP5             CanESM2                              1.16                             0.040                                          4.84                                          0.88                                              0.018                                              0.002
1     CMIP5           CESM1-BGC                              1.22  