In [None]:
from pathlib import Path
import re

# Directories scanned by the keyword-frequency comparison further down in this cell.
# NOTE(review): what each directory holds is inferred from its name and from the
# label it is printed under in the report loop — confirm against the pipeline.
PDFBOT_CLEANED = Path("../PDFBoT/output/extracted_texts")  # reported as "PDFBOT"
CLEANED_DIR = Path("../regex_cleaning/cleaned_papers")  # reported as "Regex"
GROUND_TRUTH_DIR = Path("../regex_cleaning/ground_truth")  # reported as "Ground Truth"
ORIGINAL_PAPERS = Path("../regex_cleaning/txt_dataset")  # reported as "Original"

# Count frequency of keywords in cleaned texts to compare methods
def count_pattern_frequency(directory, pattern):
    """Return how many times ``pattern`` matches across all files under ``directory``.

    The directory is walked recursively and every regular file is decoded as
    UTF-8. Files that cannot be read or decoded are reported on stdout and
    contribute nothing to the total.

    Args:
        directory: ``pathlib.Path`` of the root folder to scan.
        pattern: Compiled regular expression whose non-overlapping matches
            are counted.

    Returns:
        The summed match count over every readable file.
    """
    occurrences = 0

    for candidate in directory.glob("**/*"):  # "**/*" descends into sub-folders
        if not candidate.is_file():
            continue
        try:
            contents = candidate.read_text(encoding="utf-8")
            occurrences += sum(1 for _ in pattern.finditer(contents))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Could not read file {candidate}: {e}")

    return occurrences

# Regex patterns, all case-insensitive.
# Caption such as "Fig 3." or "Figure 12." at the start of a line.
figure_pattern = re.compile(
    r'^\s*(fig|figure)\s+\d+\.',
    flags=re.IGNORECASE | re.MULTILINE,
)
# "Acknowledgment(s)" / "Acknowledgement(s)" anywhere in the text.
acknowledgments_pattern = re.compile(
    r'\backnowledg(e)?ments?\b',
    flags=re.IGNORECASE,
)
# Caption such as "Table 2." at the start of a line.
table_pattern = re.compile(
    r'^\s*table\s+\d+\.',
    flags=re.IGNORECASE | re.MULTILINE,
)
# The word "references" opening a line.
references_pattern = re.compile(
    r'^\s*references\b',
    flags=re.IGNORECASE | re.MULTILINE,
)
# http(s) URLs, tokens starting with "ChemRxiv", and e-mail addresses.
email_and_url_pattern = re.compile(
    r'\b(?:https?://\S+|ChemRxiv\S*|[\w\.-]+@[\w\.-]+\.\w+)\b',
    flags=re.IGNORECASE,
)

# Report label -> compiled pattern; insertion order is the report order.
patterns = {
    "Figure": figure_pattern,
    "Acknowledgments": acknowledgments_pattern,
    "Table": table_pattern,
    "References": references_pattern,
    "Email and URL": email_and_url_pattern
}

for label, pattern in patterns.items():
    # Dict displays evaluate top to bottom, so the corpora are scanned in the
    # order PDFBoT, regex-cleaned, ground truth, original.
    counts = {
        "PDFBOT": count_pattern_frequency(PDFBOT_CLEANED, pattern),
        "Regex": count_pattern_frequency(CLEANED_DIR, pattern),
        "Ground Truth": count_pattern_frequency(GROUND_TRUTH_DIR, pattern),
        "Original": count_pattern_frequency(ORIGINAL_PAPERS, pattern),
    }
    # Printing uses its own order, with the ground truth first.
    for source in ("Ground Truth", "PDFBOT", "Regex", "Original"):
        print(f"{label} - {source}: {counts[source]}")



Figure - Ground Truth: 3
Figure - PDFBOT: 41
Figure - Regex: 1
Figure - Original: 74
Acknowledgments - Ground Truth: 1
Acknowledgments - PDFBOT: 7
Acknowledgments - Regex: 3
Acknowledgments - Original: 16
Table - Ground Truth: 1
Table - PDFBOT: 6
Table - Regex: 0
Table - Original: 7
References - Ground Truth: 0
References - PDFBOT: 3
References - Regex: 0
References - Original: 20
Email and URL - Ground Truth: 1
Email and URL - PDFBOT: 82
Email and URL - Regex: 0
Email and URL - Original: 1788


In [26]:
from pathlib import Path
import re

# count number of lines after keyword "References"
def count_lines_after_references(directory):
    """Sum the non-blank lines that follow the last "references" match in each file.

    Walks ``directory`` recursively and looks only at regular files whose
    suffix is ``.txt``, ``.md`` or empty. Each file is decoded as UTF-8; in
    files where the word "references" opens a line (case-insensitive), the
    text after the final such match is split on newlines and its
    non-whitespace lines are counted. Whatever remains of the matched line
    itself is part of that text. Files without a match add nothing, and files
    that fail to process are reported on stdout and skipped.

    Args:
        directory: ``pathlib.Path`` of the root folder to scan.

    Returns:
        The total count over all processed files.
    """
    heading = re.compile(r'^\s*references\b', re.IGNORECASE | re.MULTILINE)
    accepted_suffixes = ('.txt', '.md', '')
    running_total = 0

    for candidate in directory.glob("**/*"):
        if not candidate.is_file() or candidate.suffix.lower() not in accepted_suffixes:
            continue
        try:
            contents = candidate.read_text(encoding="utf-8")

            # Exhaust the iterator so that final_hit ends up on the last match.
            final_hit = None
            for final_hit in heading.finditer(contents):
                pass
            if final_hit is None:
                continue

            tail = contents[final_hit.end():]
            running_total += sum(1 for row in tail.split('\n') if row.strip())
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Could not process file {candidate}: {e}")

    return running_total

# Same path as the PDFBOT_CLEANED defined in the first cell; presumably repeated
# so that this cell can be run without executing that one — confirm.
PDFBOT_CLEANED = Path("../PDFBoT/output/extracted_texts")
# Total, across the files under that directory, of the non-empty lines found
# after each file's last "References" match.
lines_after_ref = count_lines_after_references(PDFBOT_CLEANED)
print(f"Total lines after 'References' in PDFBoT_CLEANED: {lines_after_ref}")

Total lines after 'References' in PDFBoT_CLEANED: 180
