In [None]:
from pathlib import Path

input_pdf_path = Path("../data/raw/pdf/from-script")
output_tei_path = Path("../data/interim/tei/")

# The input location must already exist and must be a directory.
if not input_pdf_path.is_dir():
    if input_pdf_path.exists():
        raise NotADirectoryError(f"Input path is not a directory: {input_pdf_path}")
    raise FileNotFoundError(f"Input path does not exist: {input_pdf_path}")

# The output location is created on demand. The existence test is kept so
# that a pre-existing regular file is reported by the directory check below
# instead of by mkdir().
output_is_missing = not output_tei_path.exists()
if output_is_missing:
    output_tei_path.mkdir(parents=True, exist_ok=True)

if not output_tei_path.is_dir():
    raise NotADirectoryError(f"Output path is not a directory: {output_tei_path}")

print("Input and output paths are valid.")


Input and output paths are valid.


In [2]:
# Base URL of the GROBID service; the API paths used in this notebook
# (/api/isalive, /api/processFulltextDocument) are appended to it.
grobid_server = "https://kermitt2-grobid.hf.space"

In [3]:
# Task: write Python code that reads every filename in the folder FOLDER and, for each filename, runs the command
# curl -v -H "Accept: application/xml" --form consolidateCitations=1 --form includeRawCitations=1 --form segmentSentences=1 --form input=@./FILENAME localhost:8070/api/processFulltextDocument > FILENAME + "tei.xml"

In [4]:
import requests

def check_grobid_server(timeout=30):
    """Report whether the GROBID server answers its health-check endpoint.

    Sends a GET request to ``{grobid_server}/api/isalive`` and considers the
    server alive only when it answers with HTTP 200 and the body ``true``.

    Args:
        timeout: Maximum number of seconds to wait for the server. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        True if the server is alive, False otherwise (including when the
        request fails or times out).
    """
    url = f"{grobid_server}/api/isalive"

    try:
        print(f"checking {url}...")
        response = requests.get(url, timeout=timeout)
        print("checked!")
    except requests.RequestException as error:
        # RequestException is the common base class of ConnectionError,
        # Timeout and the other errors requests can raise for a request.
        print(f"Unable to connect to {url}. The server may be down or unreachable. ({error})")
        return False

    body = response.text.strip()
    if response.status_code == 200 and body == "true":
        print(f"Server is up and running at {url}.")
        return True

    # Show the body too: a 200 answer with an unexpected body is also a failure.
    print(
        f"Server responded with status code: {response.status_code} "
        f"(body: {body[:200]!r})"
    )
    return False


In [5]:
assert check_grobid_server()

checking https://kermitt2-grobid.hf.space/api/isalive...
checked!
Server is up and running at https://kermitt2-grobid.hf.space/api/isalive.


In [6]:
import os

from pathlib import Path

def truncate_filename(file_path, max_length=80):
    """Shorten the final component of a "/"-separated path to ``max_length``.

    Only the filename (the text after the last "/") is shortened; the
    directory part is returned untouched. The extension (the text from the
    last "." of the filename) is preserved whenever it fits, and the stem is
    cut to make room for it.

    Args:
        file_path: Path string using "/" as separator.
        max_length: Maximum number of characters allowed for the filename.

    Returns:
        ``file_path`` unchanged when the filename already fits, otherwise the
        same path with a shortened filename.
    """
    # rpartition keeps the separator, so a root-level path such as "/name"
    # does not lose its leading "/" when it is reassembled.
    directory, separator, filename = file_path.rpartition("/")

    if len(filename) <= max_length:
        return file_path

    if "." in filename:
        stem, extension = filename.rsplit(".", 1)
        extension = "." + extension
    else:
        stem, extension = filename, ""

    stem_budget = max_length - len(extension)
    if stem_budget < 0:
        # The extension alone is longer than the limit (e.g. a long dotfile
        # name). A negative slice bound would cut from the wrong end and
        # return an over-length name, so hard-truncate the whole filename.
        shortened = filename[:max(max_length, 0)]
    else:
        shortened = stem[:stem_budget] + extension

    return f"{directory}{separator}{shortened}"


# Sanity checks: a short name is returned unchanged, while an over-long name
# loses characters from the end of its stem and keeps its extension.
for given_path, expected_path in [
    ("a/path/short_name.txt", "a/path/short_name.txt"),
    (
        "a/path/with_a_really_long_filename_that_needs_to_be_truncated_because_it_is_too_long.txt",
        "a/path/with_a_really_long_filename_that_needs_to_be_truncated_because_it_is_too_lon.txt",
    ),
]:
    assert truncate_filename(given_path) == expected_path


In [7]:
import os


# Create the output folder when it is not there yet.
output_folder_present = os.path.exists(output_tei_path)
if not output_folder_present:
    os.makedirs(output_tei_path)

# Collect (path, size in bytes) for every regular file directly inside the
# input folder; sub-directories are ignored.
candidate_paths = (
    os.path.join(input_pdf_path, entry_name)
    for entry_name in os.listdir(input_pdf_path)
)
files = [
    (candidate, os.path.getsize(candidate))
    for candidate in candidate_paths
    if os.path.isfile(candidate)
]

print(f"I have {len(files)} files to process!")


I have 1227 files to process!


In [8]:
import shlex

def print_command(curl_command):
    """Render an argument list as a single, human-readable command line.

    Despite its name this function prints nothing; it returns the string.

    Arguments that are empty or contain a space, tab, newline or double quote
    are wrapped in double quotes, with embedded double quotes escaped as
    ``\\"``. All other arguments are emitted as they are.

    Args:
        curl_command: Iterable of command-line arguments (each is converted
            with ``str``).

    Returns:
        The arguments joined by single spaces.
    """
    def quote(arg):
        text = str(arg)
        # An empty argument must be written as "" or it would silently
        # disappear from the rendered command line.
        if text == "" or any(ch in text for ch in ' \t\n"'):
            return '"' + text.replace('"', r'\"') + '"'
        return text

    return " ".join(quote(arg) for arg in curl_command)


In [9]:
import xml.etree.ElementTree as ET

def assert_valid_xml(file_path: str) -> None:
    """Check that a file holds well-formed XML.

    The file is parsed with ``xml.etree.ElementTree``; whatever exception the
    parser raises is propagated to the caller.

    Args:
        file_path: Path of the file to parse.

    Returns:
        None when the file was parsed without error.
    """
    ET.parse(file_path)
    return None


In [10]:
import subprocess
import time
import random
import os
from tqdm import tqdm 

# Convert every PDF to TEI by posting it to GROBID with curl.
#
# Files are visited in random order. (Earlier versions sorted by size and then
# shuffled, so the sort never had any effect.) A shuffled copy is used so that
# `files` itself is not modified and the cell can be re-run safely.
MIN_TEI_SIZE_BYTES = 1000    # smaller responses are treated as failed extractions
CURL_TIMEOUT_SECONDS = 600   # 10 minutes per document
PAUSE_SECONDS = 5            # pause between two requests to the server

files_shuffled = random.sample(files, k=len(files))

for file_path, file_size in tqdm(files_shuffled, desc="Processando arquivos", unit="file"):
    # An empty input file cannot be processed.
    if file_size == 0:
        tqdm.write(f"Skipped {file_path} (file size is zero)")
        continue

    filename = os.path.basename(file_path)
    output_filename = f"{filename}.grobid.tei.xml"
    output_file_path = os.path.join(output_tei_path, output_filename)

    # A non-empty output from an earlier run means the file is already done.
    if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
        tqdm.write(f"Skipped {filename} (output file already exists and is non-empty)")
        continue

    # The command is passed to subprocess as a list: no shell is involved, so
    # file names with spaces or quotes need no escaping and the call works the
    # same way on every platform.
    curl_command = [
        "curl",
        "--silent", "--show-error",  # no progress meter; stderr carries only real errors
        "--fail",                    # HTTP errors give a non-zero exit code instead of a saved error page
        "-H", "Accept: application/xml",
        "--form", "consolidateCitations=1",
        "--form", "includeRawCitations=1",
        "--form", "segmentSentences=1",
        "--form", "generateIDs=1",
        "--form", f"input=@{file_path}",
        f"{grobid_server}/api/processFulltextDocument",
    ]

    # The response is first written to a temporary ".part" file and moved to
    # its final name only once it has been validated. A failed, truncated or
    # timed-out download therefore never looks like a finished result, and the
    # document is retried on the next run.
    partial_file_path = f"{output_file_path}.part"
    succeeded = False
    try:
        with open(partial_file_path, "wb") as output_file:
            result = subprocess.run(
                curl_command,
                stdout=output_file,
                stderr=subprocess.PIPE,
                timeout=CURL_TIMEOUT_SECONDS,
            )

        if result.returncode != 0:
            error_text = result.stderr.decode(errors="replace").strip()
            tqdm.write(f"curl failed for {file_path} (exit code {result.returncode}): {error_text}")
        elif os.path.getsize(partial_file_path) < MIN_TEI_SIZE_BYTES:
            tqdm.write(f"{output_file_path} is empty or small. NOT PROCESSED.\n\n")
        else:
            assert_valid_xml(partial_file_path)
            os.replace(partial_file_path, output_file_path)
            succeeded = True
    except subprocess.TimeoutExpired:
        tqdm.write("The process did not finish within 10 minutes and was terminated.")
    except ET.ParseError as error:
        # One malformed response must not abort the whole multi-hour run.
        tqdm.write(f"{output_file_path} is not well-formed XML ({error}). NOT PROCESSED.\n\n")
    finally:
        if not succeeded and os.path.exists(partial_file_path):
            os.remove(partial_file_path)

    time.sleep(PAUSE_SECONDS)


Processando arquivos:  39%|███▉      | 477/1227 [2:36:49<3:32:22, 16.99s/file] 

Skipped Mr-Hayek-and-the-Classics-A-Suggested-Interpretation-of-the-BusinessCycle-Theory-in-Prices-and-Production_2021_Ludwig-Von-Mises-Institute.pdf (output file already exists and is non-empty)


Processando arquivos: 100%|██████████| 1227/1227 [6:29:23<00:00, 19.04s/file]  
