In [4]:
import PyPDF2
import pdfplumber
import pandas as pd
import re
import os
import time
import psutil
import threading
import textract


def measure_extraction_performance_parallel(extraction_func, pdf_path):
    """
    Measure performance metrics for a given extraction method in parallel.
    
    Args:
        extraction_func (callable): Function to extract text.
        pdf_path (str): Path to PDF file.
    
    Returns:
        dict: Performance metrics for the extraction method.
    """
    metrics = {
        'extraction_time': 0,
        'memory_usage': 0,
        'cpu_usage': 0,
        'extracted_text_length': 0
    }
    
    # Function to monitor CPU and memory usage
    def monitor_performance():
        process = psutil.Process()
        while not stop_event.is_set():
            metrics['memory_usage'] = max(metrics['memory_usage'], process.memory_info().rss / (1024 * 1024))
            metrics['cpu_usage'] = max(metrics['cpu_usage'], psutil.cpu_percent(interval=0.1))
            time.sleep(0.1)

    # Start monitoring in a separate thread
    stop_event = threading.Event()
    monitor_thread = threading.Thread(target=monitor_performance)
    monitor_thread.start()

    # Perform the extraction
    start_time = time.time()
    extracted_text = extraction_func(pdf_path)
    metrics['extraction_time'] = time.time() - start_time
    metrics['extracted_text_length'] = len(extracted_text)

    # Stop monitoring and wait for the thread to finish
    stop_event.set()
    monitor_thread.join()

    return metrics


def extract_with_pypdf2(pdf_path):
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() for page in reader.pages)


def extract_with_pdfplumber(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        return "".join(page.extract_text() for page in pdf.pages)


def extract_text_from_pdf(pdf_path):
    try:
        return textract.process(pdf_path).decode('utf-8')
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


def clean_text(text, filename, method):
    """
    Clean extracted text and save to file
    
    Args:
        text (str): Raw extracted text
        filename (str): Name of the source file
        method (str): Extraction method used
    
    Returns:
        str: Cleaned text
    """
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove page numbers, headers, footers
    text = re.sub(r'\d+\s*[\|\-]\s*\d+', '', text)
    
    # Ensure method directory exists
    os.makedirs(method, exist_ok=True)
    
    # Write cleaned text to file
    f = open(os.path.join(method, filename[:-4] + ".txt"), "a", encoding="utf-8")
    f.write(text)
    f.close()
    
    return text


def process_pdf_directory(directory_path, performance_file="extraction_performance.csv"):
    """
    Process all PDFs in a directory with performance tracking
    
    Args:
        directory_path (str): Path to directory with PDFs
        performance_file (str): Path to save performance metrics
    
    Returns:
        pd.DataFrame: DataFrame with performance metrics
    """
    performance_results = []
    extraction_methods = [
        ("PyPDF2", extract_with_pypdf2),
        ("PDFPlumber", extract_with_pdfplumber),
        ("Textract", extract_text_from_pdf)
    ]

    for filename in os.listdir(directory_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(directory_path, filename)
            
            for method_name, extraction_func in extraction_methods:
                try:
                    # Measure performance
                    performance_metrics = measure_extraction_performance_parallel(extraction_func, pdf_path)
                    
                    # Extract and clean text
                    raw_text = extraction_func(pdf_path)
                    clean_text(raw_text, filename, method=f"./{method_name.lower()}/")
                    
                    # Append performance results
                    performance_results.append({
                        'Filename': filename,
                        'Extraction Method': method_name,
                        **performance_metrics
                    })
                    print(f'{filename} processed successfully with {method_name}')
                except Exception as e:
                    print(f"Error processing {filename} with {method_name}: {e}")

    # Save performance metrics
    performance_df = pd.DataFrame(performance_results)
    performance_df.to_csv(performance_file, index=False)
    print(f"Performance metrics saved to {performance_file}")
    
    return performance_df

# Example usage
pdf_directory = "../ESG REPORTS"
performance_metrics = process_pdf_directory(pdf_directory)
print(performance_metrics)


HugoBoss_2021.pdf processed successfully with PyPDF2
HugoBoss_2021.pdf processed successfully with PDFPlumber
HugoBoss_2021.pdf processed successfully with Textract
Aixtron_2021.pdf processed successfully with PyPDF2
Aixtron_2021.pdf processed successfully with PDFPlumber
Aixtron_2021.pdf processed successfully with Textract
HugoBoss_2018.pdf processed successfully with PyPDF2
HugoBoss_2018.pdf processed successfully with PDFPlumber
HugoBoss_2018.pdf processed successfully with Textract
Dürr_2017.pdf processed successfully with PyPDF2
Dürr_2017.pdf processed successfully with PDFPlumber
Dürr_2017.pdf processed successfully with Textract
Zalando_2023.pdf processed successfully with PyPDF2
Zalando_2023.pdf processed successfully with PDFPlumber
Zalando_2023.pdf processed successfully with Textract
Metro_2020.pdf processed successfully with PyPDF2
Metro_2020.pdf processed successfully with PDFPlumber
Metro_2020.pdf processed successfully with Textract
Aixtron_2023.pdf processed successful

ignore '/Perms' verify failed


Zalando_2022.pdf processed successfully with Textract


ignore '/Perms' verify failed


GEAGroup_2023.pdf processed successfully with PyPDF2


Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data
Data-loss while decompressing corrupted data


Error processing GEAGroup_2023.pdf with PDFPlumber: Invalid dictionary construct: [/'FB', /b'tru', /b'e', /'SW', /'N']
GEAGroup_2023.pdf processed successfully with Textract
Error processing DeutscheBank_2017.pdf with PyPDF2: list index out of range
DeutscheBank_2017.pdf processed successfully with PDFPlumber
DeutscheBank_2017.pdf processed successfully with Textract
Metro_2018.pdf processed successfully with PyPDF2
Metro_2018.pdf processed successfully with PDFPlumber
Metro_2018.pdf processed successfully with Textract
CompuGroup_2022.pdf processed successfully with PyPDF2
CompuGroup_2022.pdf processed successfully with PDFPlumber
CompuGroup_2022.pdf processed successfully with Textract
CompuGroup_2023.pdf processed successfully with PyPDF2
CompuGroup_2023.pdf processed successfully with PDFPlumber
CompuGroup_2023.pdf processed successfully with Textract
Zalando_2021.pdf processed successfully with PyPDF2
Zalando_2021.pdf processed successfully with PDFPlumber
Zalando_2021.pdf process