In [11]:
from pathlib import Path

input_pdf_path = Path("../data/raw/pdf/from-script")
output_tei_path = Path("../data/interim/tei/")

# Check input path
if not input_pdf_path.exists():
    raise FileNotFoundError(f"Input path does not exist: {input_pdf_path}")

if not input_pdf_path.is_dir():
    raise NotADirectoryError(f"Input path is not a directory: {input_pdf_path}")

# Check / create output path
if not output_tei_path.exists():
    output_tei_path.mkdir(parents=True, exist_ok=True)

if not output_tei_path.is_dir():
    raise NotADirectoryError(f"Output path is not a directory: {output_tei_path}")

print("Input and output paths are valid.")


Input and output paths are valid.


In [12]:
grobid_server = "https://kermitt2-grobid.hf.space"

In [13]:
# Write python code that reads all filenames from folder FOLDER and for each filename runs the command
# curl -v -H "Accept: application/xml" --form consolidateCitations=1 --form includeRawCitations=1 --form segmentSentences=1 --form input=@./FILENAME localhost:8070/api/processFulltextDocument > FILENAME + "tei.xml"

In [14]:
import requests

def check_grobid_server():
    url = f"{grobid_server}/api/isalive"
    
    try:
        # Send a GET request to the URL
        # 
        print(f"checking {url}...")
        response = requests.get(url)
        print("checked!")

        # Check if the status code indicates success (200-299)
        if response.status_code == 200 and response.text == "true":
            print(f"Server is up and running at {url}.")
            return True
        else:
            print(f"Server responded with status code: {response.status_code}")
            return False
    
    except requests.ConnectionError:
        print(f"Unable to connect to {url}. The server may be down or unreachable.")
        return False


In [15]:
assert check_grobid_server()

checking https://kermitt2-grobid.hf.space/api/isalive...
checked!
Server is up and running at https://kermitt2-grobid.hf.space/api/isalive.


In [16]:
import os

from pathlib import Path

def truncate_filename(file_path, max_length=80):
    parts = file_path.rsplit("/", 1)

    # Handle case with no directory
    if len(parts) == 1:
        directory = ""
        filename = parts[0]
    else:
        directory, filename = parts

    # Split extension
    if "." in filename:
        stem, ext = filename.rsplit(".", 1)
        ext = "." + ext
    else:
        stem, ext = filename, ""

    # If already within limit, return unchanged
    if len(filename) <= max_length:
        return file_path

    max_stem_len = max_length - len(ext)
    truncated_filename = stem[:max_stem_len] + ext

    return (
        truncated_filename
        if directory == ""
        else f"{directory}/{truncated_filename}"
    )


assert truncate_filename("a/path/short_name.txt") == "a/path/short_name.txt"
assert truncate_filename("a/path/with_a_really_long_filename_that_needs_to_be_truncated_because_it_is_too_long.txt") == "a/path/with_a_really_long_filename_that_needs_to_be_truncated_because_it_is_too_lon.txt"


In [17]:
import os


# Make sure the output folder exists
if not os.path.exists(output_tei_path):
    os.makedirs(output_tei_path)

# List all files in the folder
files = []

for filename in os.listdir(input_pdf_path):
    file_path = os.path.join(input_pdf_path, filename)
    if os.path.isfile(file_path):
        # Add the file and its size to the list
        files.append((file_path, os.path.getsize(file_path)))

print(f"I have {len(files)} files to process!")


I have 1297 files to process!


In [18]:
import shlex

def print_command(curl_command):
    def quote(arg):
        arg = str(arg)
        if any(c in arg for c in ' \t\n"'):
            arg = arg.replace('"', r'\"')
            return f'"{arg}"'
        return arg

    return " ".join(quote(arg) for arg in curl_command)


In [19]:
import xml.etree.ElementTree as ET

def assert_valid_xml(file_path: str) -> None:
    """
    Lança exceção se o arquivo não contiver um XML bem-formado.
    Retorna None se o XML for válido.
    """
    ET.parse(file_path)


In [20]:
import subprocess
import time
import random
import os
from tqdm import tqdm 

# Sort files by size (smallest first)
files.sort(key=lambda x: x[1])
random.shuffle(files)

# Process each file in order of size com tqdm
for file_path, file_size in tqdm(files, desc="Processando arquivos", unit="file"):
    # Skip processing if the file size is zero
    if file_size == 0:
        tqdm.write(f"Skipped {file_path} (file size is zero)")
        continue

    # Extract the filename from the path
    filename = os.path.basename(file_path)

    # Prepare the output filename
    output_filename = f"{filename}.grobid.tei.xml"
    output_file_path = os.path.join(output_tei_path, output_filename)

    # Check if the output file exists and its size is greater than 0
    if not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0:
        # Prepare the curl command
        #tqdm.write(f"Trying to extract {file_path} from PDF to TEI...")

        #file_path = truncate_filename(file_path)
            
        curl_command = [
            "curl", # "-v", 
            "-H", "Accept: application/xml", 
            "--form", "consolidateCitations=1", 
            "--form", "includeRawCitations=1", 
            "--form", "segmentSentences=1",
            "--form", "generateIDs=1", 
            "--form", f"input=@{file_path}", 
            f"{grobid_server}/api/processFulltextDocument",
        ]

        curl_command = print_command(curl_command)
        #print(curl_command)
        #print(output_file_path)

        # Open the output file to redirect the curl output into it
        with open(output_file_path, "w") as output_file:
            try:
                # Run the process with a timeout of 10 minutes (600 seconds)
                #result = subprocess.run(f"{curl_command} > {output_file_path}")
                result = subprocess.run(curl_command, stdout=output_file, stderr=subprocess.PIPE, timeout=600)
                if result.stderr:
                    a = 1
                    #tqdm.write("STDERR:", output_file)
                    #tqdm.write(result.stderr.decode())

            except subprocess.TimeoutExpired:
                tqdm.write("The process did not finish within 10 minutes and was terminated.")

        # Check if the output file size is zero
        if os.path.getsize(output_file_path) < 1000:
            tqdm.write(f"{output_file_path} is empty or small. NOT PROCESSED.\n\n")
            time.sleep(5)
        else:
           # tqdm.write(f"Processed succesfully {filename} -> {output_file_path}\n\n")
            assert_valid_xml(output_file_path)
            time.sleep(5)
    else:
        tqdm.write(f"Skipped {filename} (output file already exists and is non-empty)")


                                                                

                                                                

Skipped 10.1002.mde.3133.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-1999-2-307.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538259500000046.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-015-0316-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-013-0242-8.pdf (output file already exists and is non-empty)
Skipped 10.1108.978-1-78714-501-620171019.pdf (output file already exists and is non-empty)
Skipped 10.1287.orsc.1110.0727.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-006-6094-8.pdf (output file already exists and is non-empty)
Skipped 10.1080.02691721003749877.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1468-4446.2000.00553.x.pdf (output file already exists and is non-empty)
Skipped 10.1016.S0185-1667(13)72596-7.pdf (output file already exists and is non-empty)
Skipped 10.3828.tpr.76.4.5.pdf (output file already exists and is non-e

Processando arquivos:   1%|▏         | 19/1297 [00:00<00:06, 186.02file/s]

Skipped 10.1007.s10843-007-0011-5.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-018-0425-4.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409700900105.pdf (output file already exists and is non-empty)
Skipped 10.1016.S0361-3682(99)00029-X.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02426369.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF00842704.pdf (output file already exists and is non-empty)


Processando arquivos:   1%|▏         | 19/1297 [10:00<00:06, 186.02file/s]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.1007.978-3-319-71528-5.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




                                                                            

Skipped 10.1080.08913819008459625.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11293-014-9407-5.pdf (output file already exists and is non-empty)
Skipped 10.1177.053901886025001012.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJEBR.2017.086703.pdf (output file already exists and is non-empty)
Skipped 10.1080.08276331.2020.1764736.pdf (output file already exists and is non-empty)
Skipped 10.1080.08276331.2020.1771812.pdf (output file already exists and is non-empty)
Skipped 10.1108.PIJPSM-08-2016-0135.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1468-0270.2009.01920.x.pdf (output file already exists and is non-empty)
Skipped 10.4337.9781785366642.00049.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-024-00655-1.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01539332.pdf (output file already exists and is non-empty)
Skipped 10.1257.002205105774431225.pdf (output file already ex

Processando arquivos:   6%|▌         | 81/1297 [10:05<1:29:10,  4.40s/file]

Skipped 10.1111.j.1536-7150.2005.00399.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-94028-1_12.pdf (output file already exists and is non-empty)
Skipped 10.1016.S0123-5923(10)70117-0.pdf (output file already exists and is non-empty)
Skipped 10.1016.S0263-2373(03)00108-7.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11127-013-0143-1.pdf (output file already exists and is non-empty)
Skipped 10.3390.admsci10040077.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-007-0033-1.pdf (output file already exists and is non-empty)
Skipped 10.1002.soej.12212.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-10-2015-0031.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJEBR.2014.060034.pdf (output file already exists and is non-empty)
Skipped 10.1057.9780230367098.pdf (output file already exists and is non-empty)
Skipped 10.1080.00346768400000020.pdf (output file already exists and is

                                                                           

Skipped 10.1177.030437549201700303.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.ejor.2019.04.027.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-22141-6_10.pdf (output file already exists and is non-empty)
Skipped 10.1215.10679847-2793221.pdf (output file already exists and is non-empty)
Skipped 10.1017.S000712340000315X.pdf (output file already exists and is non-empty)
Skipped 10.1515.ajle-2021-0012.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538258900000004.pdf (output file already exists and is non-empty)
Skipped 10.1177.0018726720929397.pdf (output file already exists and is non-empty)
Skipped 10.1007.s40888-023-00301-2.pdf (output file already exists and is non-empty)
Skipped 10.1108.01443588910143964.pdf (output file already exists and is non-empty)
Skipped 10.1117.12.488441.pdf (output file already exists and is non-empty)
Skipped 10.3233.HSM-2011-0758.pdf (output file already exists and is non-empty)
Ski

                                                                          

Skipped 10.1007.0-387-28181-9_27.pdf (output file already exists and is non-empty)
Skipped 10.1080.13669877.2020.1772346.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-58926-4_2.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2000-0211.pdf (output file already exists and is non-empty)
Skipped 10.1111.1467-8551.12253.pdf (output file already exists and is non-empty)
Skipped 10.1080.08913811.2016.1264158.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-016-0374-8.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-008-0046-4.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.scaman.2008.03.001.pdf (output file already exists and is non-empty)
Skipped 10.1177.2515127419829394.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0170-4.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-009-0070-z.pdf (output file already exists and 

Processando arquivos:  12%|█▏        | 157/1297 [10:06<15:58,  1.19file/s]

Skipped 10.1007.s11138-011-0140-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12115-010-9305-7.pdf (output file already exists and is non-empty)
Skipped 10.1017.9781316480748.022.pdf (output file already exists and is non-empty)
Skipped 10.2308.acch-10311.pdf (output file already exists and is non-empty)
Skipped 10.1080.00472778.2021.1945071.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1475-4932.1982.tb00361.x.pdf (output file already exists and is non-empty)
Skipped 10.1080.14697017.2013.841006.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-007-0039-8.pdf (output file already exists and is non-empty)
Skipped 10.1080.00128775.2002.11041012.pdf (output file already exists and is non-empty)
Skipped 10.1177.2047173421994333.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-010-0138-9.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.1998.tb03475.x.pdf (output file alr

Processando arquivos:  15%|█▌        | 195/1297 [10:06<07:25,  2.47file/s]

Skipped 10.1515.jeeh-2016-0008.pdf (output file already exists and is non-empty)
Skipped 10.1515.erj-2020-0519.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11229-020-02730-z.pdf (output file already exists and is non-empty)
Skipped 10.1016.0016-3287(75)90098-1.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781118324950.ch15.pdf (output file already exists and is non-empty)
Skipped 10.1093.cje.27.1.97.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538258900000021.pdf (output file already exists and is non-empty)
Skipped 10.1108.eb006092.pdf (output file already exists and is non-empty)
Skipped 10.1017.S0265052517000255.pdf (output file already exists and is non-empty)
Skipped 10.1080.08913811.2012.684478.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1182.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538259.2016.1108131.pdf (output file already exists and is non-empty)
Skipp

                                                                          

Skipped 10.1057.9781137287700.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-56261-2_6.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.emj.2008.06.002.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-39654-5_2.pdf (output file already exists and is non-empty)
Skipped 10.1353.rhm.2019.0015.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF00709143.pdf (output file already exists and is non-empty)
Skipped 10.4337.9781782548362.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-1999-0401.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1053837218000767.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781119199199.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11127-019-00720-5.pdf (output file already exists and is non-empty)
Skipped 10.1111.1536-7150.00135.pdf (output file already exists and is non-empty)
Skipped 10.1007

                                                                          

Skipped 10.1353.ff.2021.0010.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409200500308.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0143-7.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jemep.2016.05.006.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2007.00550.x.pdf (output file already exists and is non-empty)
Skipped 10.1016.0090-5720(82)90016-X.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbusvent.2003.09.003.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-05557-8.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-020-00521-w.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10273-021-3000-8.pdf (output file already exists and is non-empty)
Skipped 10.1108.17506200810897196.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-021-00551-y.pdf (output file already ex

Processando arquivos:  22%|██▏       | 287/1297 [10:06<01:14, 13.60file/s]

Skipped 10.1515.jeeh-1996-2-315.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jebo.2021.07.005.pdf (output file already exists and is non-empty)
Skipped 10.1080.07360932.2022.2052737.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jce.2003.08.005.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbef.2015.02.006.pdf (output file already exists and is non-empty)
Skipped 10.1002.mde.4090060313.pdf (output file already exists and is non-empty)
Skipped 10.1142.9789814273374_0010.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2012.00862.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-022-00589-6.pdf (output file already exists and is non-empty)
Skipped 10.1057.s41272-018-00169-z.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-1-137-53556-6.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-014-0298-0.pdf (output file already exists

Processando arquivos:  22%|██▏       | 287/1297 [10:06<01:14, 13.60file/s]

Skipped 10.1080.03071029308567871.pdf (output file already exists and is non-empty)
Skipped 10.2753.EEE0012-8775490203.pdf (output file already exists and is non-empty)
Skipped 10.1111.twec.12659.pdf (output file already exists and is non-empty)
Skipped 10.1111.1467-9248.00323.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00492-7.pdf (output file already exists and is non-empty)


Processando arquivos:  25%|██▌       | 325/1297 [17:02<1:16:02,  4.69s/file]

Skipped 10.1016.j.technovation.2017.06.001.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10100-017-0500-0.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11127-024-01199-5.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-76288-3_13.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10551-019-04171-2.pdf (output file already exists and is non-empty)
Skipped 10.1093.cje.beab001.pdf (output file already exists and is non-empty)
Skipped 10.1002.mde.4090040406.pdf (output file already exists and is non-empty)
Skipped 10.1108.00400911111147758.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.1996.tb02712.x.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.ruje.2016.04.005.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10490-008-9110-7.pdf (output file already exists and is non-empty)
Skipped 10.1177.0170840607072546.pdf (output file already exi

Processando arquivos:  28%|██▊       | 360/1297 [17:02<37:18,  2.39s/file]  

Skipped 10.1108.S1529-213420180000023007.pdf (output file already exists and is non-empty)
Skipped 10.1177.004057360606300202.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01539562.pdf (output file already exists and is non-empty)
Skipped 10.1080.00213624.2018.1430943.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420150000019009.pdf (output file already exists and is non-empty)
Skipped 10.5465.amp.2017.0181.pdf (output file already exists and is non-empty)
Skipped 10.1080.00346768300000004.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-008-0049-1.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.forpol.2016.06.015.pdf (output file already exists and is non-empty)
Skipped ..\data\raw\pdf\from-script\10.1061.(ASCE)1052-3928(1989)115 (file size is zero)
Skipped 10.1016.0361-3682(87)90010-9.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10551-015-2602-8.pdf (output file alread

                                                                          

Skipped 10.1007.BF00842703.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-016-0345-0.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-48667-9.pdf (output file already exists and is non-empty)
Skipped 10.1016.0090-5720(82)90032-8.pdf (output file already exists and is non-empty)
Skipped 10.1080.00076791.2011.599593.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.socec.2012.08.001.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12113-008-9034-6.pdf (output file already exists and is non-empty)
Skipped 10.1007.s001910050013.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420160000020004.pdf (output file already exists and is non-empty)
Skipped 10.1108.03068290910921154.pdf (output file already exists and is non-empty)
Skipped 10.1016.S1529-2134(05)08001-4.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.qref.2013.05.010.pdf (output file already exists a

Processando arquivos:  31%|███       | 398/1297 [17:39<19:35,  1.31s/file]

Skipped 10.1007.BF02685380.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11127-019-00657-9.pdf (output file already exists and is non-empty)
Skipped 10.3917.redp.271.0119.pdf (output file already exists and is non-empty)
Skipped 10.1109.PTC.2005.4524747.pdf (output file already exists and is non-empty)
Skipped 10.23919.PICMET.2018.8481743.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-013-0240-x.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJEV.2020.107930.pdf (output file already exists and is non-empty)


Processando arquivos:  31%|███       | 398/1297 [27:39<19:35,  1.31s/file]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.1007.978-3-030-62962-5_3.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




Processando arquivos:  31%|███       | 402/1297 [27:44<4:02:53, 16.28s/file]

Skipped 10.4324.9781315298955.pdf (output file already exists and is non-empty)
Skipped 10.1108.00251741211246987.pdf (output file already exists and is non-empty)
Skipped 10.1111.twec.13028.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1047.pdf (output file already exists and is non-empty)
Skipped 10.1080.01969728908902188.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbusvent.2010.12.001.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1152.pdf (output file already exists and is non-empty)
Skipped 10.1080.08935696.2014.980674.pdf (output file already exists and is non-empty)
Skipped 10.1109.ITMC.2011.5995994.pdf (output file already exists and is non-empty)
Skipped 10.1002.asi.23526.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2012-0006.pdf (output file already exists and is non-empty)
Skipped 10.1007.bf01101890.pdf (output file already exists and is non-empty)
Skipped 10.1002.mde

Processando arquivos:  31%|███       | 402/1297 [37:44<4:02:53, 16.28s/file]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.4324.9781315212500.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




                                                                            

Skipped 10.1016.j.jwb.2007.11.013.pdf (output file already exists and is non-empty)
Skipped 10.1215.00182702-28-2-219.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2008.00569.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0152-6.pdf (output file already exists and is non-empty)
Skipped 10.1080.135017800453751.pdf (output file already exists and is non-empty)
Skipped 10.1093.oxrep.grv015.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00461-0.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-66053-6.pdf (output file already exists and is non-empty)
Skipped 10.1108.03068290610689723.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1467-6435.1996.tb01398.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00445-0.pdf (output file already exists and is non-empty)
Skipped 10.1108.14636680610712496.pdf (output file already exists

Processando arquivos:  36%|███▌      | 467/1297 [37:49<1:38:34,  7.13s/file]

Skipped 10.1080.09502381003750278.pdf (output file already exists and is non-empty)
Skipped 10.1108.NBRI-11-2015-0027.pdf (output file already exists and is non-empty)
Skipped 10.1109.ICMIT.2008.4654356.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0167-z.pdf (output file already exists and is non-empty)
Skipped 10.1515.jbvela-2016-0013.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538259200000016.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1752-1688.1992.tb04018.x.pdf (output file already exists and is non-empty)
Skipped 10.1017.S0266267100003710.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-73528-3_9.pdf (output file already exists and is non-empty)
Skipped 10.1093.oxrep.gry020.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1468-0270.2009.01900.x.pdf (output file already exists and is non-empty)
Skipped 10.1163.9789004305311.pdf (output file already exists a

Processando arquivos:  39%|███▊      | 501/1297 [37:49<44:11,  3.33s/file]  

Skipped 10.1007.BF00140290.pdf (output file already exists and is non-empty)
Skipped 10.1177.1470593114534342.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-012-0193-5.pdf (output file already exists and is non-empty)
Skipped 10.2307.41560363.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-005-5592-4.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-94-017-8675-1.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0141-9.pdf (output file already exists and is non-empty)
Skipped 10.1080.00336297.2014.984735.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10602-016-9222-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-013-0233-9.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01539569.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-78702-8.pdf (output file already exists and is non-empty)
Skipped 10.10

Processando arquivos:  40%|███▉      | 517/1297 [37:50<30:23,  2.34s/file]

Skipped 10.1016.j.langsci.2018.04.006.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-011-0162-4.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-016-0350-3.pdf (output file already exists and is non-empty)
Skipped 10.1300.J140v06n04_02.pdf (output file already exists and is non-empty)
Skipped 10.1016.S1053-5357(02)00140-3.pdf (output file already exists and is non-empty)
Skipped 10.1002.mde.4090050304.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420150000019007.pdf (output file already exists and is non-empty)
Skipped 10.1080.00220485.2012.636713.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10818-019-09285-1.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJESB.2011.039007.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781119201779.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420150000019017.pdf (output file already ex

Processando arquivos:  43%|████▎     | 555/1297 [37:50<12:59,  1.05s/file]

Skipped 10.1007.s11138-019-00444-1.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02145244.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJPEE.2019.101727.pdf (output file already exists and is non-empty)
Skipped 10.1108.IJEBR-05-2017-0180.pdf (output file already exists and is non-empty)
Skipped 10.1177.053901800039004002.pdf (output file already exists and is non-empty)
Skipped 10.1177.0973801018768989.pdf (output file already exists and is non-empty)
Skipped 10.1080.00913367.1986.10673004.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10602-022-09365-x.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2008.00592.x.pdf (output file already exists and is non-empty)
Skipped 10.1017.S0034670500032150.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-021-00548-7.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-8349-3722-3_17.pdf (output file already exist

Processando arquivos:  45%|████▌     | 589/1297 [37:50<06:13,  1.90file/s]

Skipped 10.2307.41607996.pdf (output file already exists and is non-empty)
Skipped 10.1080.1350178X.2011.575949.pdf (output file already exists and is non-empty)
Skipped 10.1080.03031853.1990.9525107.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-016-0356-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11127-006-9046-8.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01049234.pdf (output file already exists and is non-empty)
Skipped 10.1017.s026505250000193x.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02426927.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-28134-6_13.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2010.00742.x.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2013-0012.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00485-6.pdf (output file already exists and is non-empty)


Processando arquivos:  46%|████▌     | 593/1297 [39:52<38:01,  3.24s/file]

Skipped 10.1007.s11138-007-0034-0.pdf (output file already exists and is non-empty)
Skipped 10.1086.676068.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2012.00859.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01102289.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02693321.pdf (output file already exists and is non-empty)
Skipped 10.3200.MONO.131.4.285-358.pdf (output file already exists and is non-empty)
Skipped 10.1561.0300000018.pdf (output file already exists and is non-empty)
Skipped 10.3390.su10082813.pdf (output file already exists and is non-empty)
Skipped 10.1177.1468795X09105446.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10843-011-0081-2.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1540-6520.2008.00255.x.pdf (output file already exists and is non-empty)
Skipped 10.4337.9781849809634.pdf (output file already exists and is non-empty)
Skipped 10.1057.97811

Processando arquivos:  46%|████▌     | 593/1297 [49:52<38:01,  3.24s/file]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.1111.beer.12037.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




                                                                            

Skipped 10.1108.S1048-473620150000025006.pdf (output file already exists and is non-empty)
Skipped 10.1109.PERCOM.2007.33.pdf (output file already exists and is non-empty)
Skipped 10.1016.0281-7527(87)90007-7.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12122-997-1036-1.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-26692-3_1.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420150000019011.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1533-8525.2008.00113.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10551-009-0263-1.pdf (output file already exists and is non-empty)
Skipped 10.1080.09672567.2022.2123529.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1744-7976.2009.01165.x.pdf (output file already exists and is non-empty)
Skipped 10.1080.08913818708459503.pdf (output file already exists and is non-empty)
Skipped 10.1515.jbvela-2016-0018.pdf (outp

Processando arquivos:  51%|█████     | 656/1297 [49:58<49:47,  4.66s/file]  

Skipped 10.1080.08913819408443346.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-06215-0_2.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10818-005-7606-5.pdf (output file already exists and is non-empty)
Skipped 10.1007.s40926-017-0054-1.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-014-0256-x.pdf (output file already exists and is non-empty)
Skipped 10.1177.0276146700201007.pdf (output file already exists and is non-empty)
Skipped 10.1108.S0743-41542016000034B012.pdf (output file already exists and is non-empty)
Skipped 10.1177.1473095205051437.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1744137414000587.pdf (output file already exists and is non-empty)
Skipped 10.1093.oxfordhb.9780190469733.013.36.pdf (output file already exists and is non-empty)
Skipped 10.1177.0266242611425838.pdf (output file already exists and is non-empty)
Skipped 10.1080.19422539.2018.1561134.pdf (output file alr

Processando arquivos:  53%|█████▎    | 692/1297 [49:58<21:09,  2.10s/file]

Skipped 10.1007.s11138-014-0277-5.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.intell.2010.06.002.pdf (output file already exists and is non-empty)
Skipped 10.1093.oxfordjournals.cje.a035181.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-18848-5.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2005.00422.x.pdf (output file already exists and is non-empty)
Skipped 10.1590.198055272221.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-1999-2-311.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00472-x.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1813-6982.1991.tb00966.x.pdf (output file already exists and is non-empty)
Skipped 10.2202.1935-1682.1950.pdf (output file already exists and is non-empty)
Skipped 10.1108.03068299510764235.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-022-00574-z.pdf (output file alrea

Processando arquivos:  55%|█████▍    | 711/1297 [49:58<13:52,  1.42s/file]

Skipped 10.2202.1145-6396.1146.pdf (output file already exists and is non-empty)
Skipped 10.1111.1467-9485.00044.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1467-6435.1988.tb02731.x.pdf (output file already exists and is non-empty)
Skipped 10.1177.0090591718807151.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420160000020006.pdf (output file already exists and is non-empty)
Skipped 10.1017.S0012217300012981.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.ibusrev.2004.04.009.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-017-0402-3.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-021-00556-7.pdf (output file already exists and is non-empty)
Skipped 10.1111.beer.12441.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-1999-2-305.pdf (output file already exists and is non-empty)
Skipped 10.1080.10427710500370273.pdf (output file already exists and 

                                                                          

Skipped 10.1108.IJEBR-01-2012-0015.pdf (output file already exists and is non-empty)
Skipped 10.1080.030851400360460.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781444315578.ch13.pdf (output file already exists and is non-empty)
Skipped 10.1016.0164-0704(84)90005-3.pdf (output file already exists and is non-empty)
Skipped 10.1177.0048393120917757.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02958735.pdf (output file already exists and is non-empty)
Skipped 10.1108.13522750410540182.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1467-9485.1984.tb00464.x.pdf (output file already exists and is non-empty)
Skipped 10.1016.1061-7361(95)90024-1.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-2134(2011)0000015017.pdf (output file already exists and is non-empty)
Skipped 10.1002.sej.1311.pdf (output file already exists and is non-empty)
Skipped 10.1080.00131857.2020.1767073.pdf (output file already exists

                                                                          

Skipped 10.1007.s11138-011-0151-7.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-04-2020-0018.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2009.00692.x.pdf (output file already exists and is non-empty)
Skipped 10.1093.cesifo.ifu009.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02393049.pdf (output file already exists and is non-empty)
Skipped 10.1142.9789812792426_0025.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1813-6982.1992.tb00219.x.pdf (output file already exists and is non-empty)
Skipped 10.1515.erj-2015-0042.pdf (output file already exists and is non-empty)
Skipped 10.1093.cje.bep028.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10602-014-9152-4.pdf (output file already exists and is non-empty)
Skipped 10.1080.08913819008459590.pdf (output file already exists and is non-empty)
Skipped 10.1080.00128775.2004.11041066.pdf (output file already exists and is non

                                                                          

Skipped 10.1007.BF01303407.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409700900209.pdf (output file already exists and is non-empty)
Skipped 10.1108.03068290010306435.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2000-0104.pdf (output file already exists and is non-empty)
Skipped 10.1108.S0743-41542019000037B021.pdf (output file already exists and is non-empty)
Skipped 10.1108.03090560110388097.pdf (output file already exists and is non-empty)
Skipped 10.1016.S1389-1286(00)00142-0.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11187-023-00746-6.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-009-0093-5.pdf (output file already exists and is non-empty)
Skipped 10.1080.08039410.2003.9666244.pdf (output file already exists and is non-empty)
Skipped 10.1108.03068299410049528.pdf (output file already exists and is non-empty)
Skipped 10.1007.b137529.pdf (output file already exists and is non-em

Processando arquivos:  62%|██████▏   | 809/1297 [49:59<01:27,  5.59file/s]

Skipped 10.1108.09513559310023590.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1467-923X.2006.00821.x.pdf (output file already exists and is non-empty)
Skipped 10.1515.zfw-2017-0040.pdf (output file already exists and is non-empty)
Skipped 10.1108.00251740210452791.pdf (output file already exists and is non-empty)
Skipped 10.1057.9781137353511.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2007.00520.x.pdf (output file already exists and is non-empty)
Skipped 10.4018.jisss.2009070101.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-018-0427-2.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12113-008-9043-5.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538250802170228.pdf (output file already exists and is non-empty)
Skipped 10.1007.s00191-020-00715-2.pdf (output file already exists and is non-empty)
Skipped 10.1108.00251740210437734.pdf (output file already exists and 

Processando arquivos:  64%|██████▎   | 825/1297 [49:59<01:01,  7.70file/s]

Skipped 10.1177.0266242614551185.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11293-022-09740-x.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781118867938.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781119200918.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420190000024009.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10551-014-2164-1.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1465-7287.1988.tb00278.x.pdf (output file already exists and is non-empty)
Skipped 10.1016.B978-0-08-097086-8.71072-5.pdf (output file already exists and is non-empty)
Skipped 10.5465.AMR.2018.0285.pdf (output file already exists and is non-empty)
Skipped 10.1016.s1048-4736(01)13010-9.pdf (output file already exists and is non-empty)
Skipped 10.1093.ser.mwy049.pdf (output file already exists and is non-empty)
Skipped 10.1515.peps-2015-0001.pdf (output file already exists and

                                                                            

Skipped 10.1007.s11138-013-0226-8.pdf (output file already exists and is non-empty)
Skipped 10.5465.AMR.2020.0120.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-005-6827-0.pdf (output file already exists and is non-empty)
Skipped 10.1068.c0832.pdf (output file already exists and is non-empty)
Skipped 10.1108.01443330910975669.pdf (output file already exists and is non-empty)
Skipped 10.1123.jsm.2012-0159.pdf (output file already exists and is non-empty)
Skipped 10.1080.09538259200000005.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12115-018-00322-9.pdf (output file already exists and is non-empty)
Skipped 10.1109.ICIF.2003.177494.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-019-00450-3.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-03-2016-0010.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420160000020009.pdf (output file already exists and is non-empty)
Skip

                                                                          

Skipped 10.1504.IJEV.2018.094627.pdf (output file already exists and is non-empty)
Skipped 10.1177.0961463X04045672.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12113-008-9036-4.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10657-018-9607-6.pdf (output file already exists and is non-empty)
Skipped 10.1177.10564926211031290.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-12-2015-0037.pdf (output file already exists and is non-empty)
Skipped 10.1108.eb002673.pdf (output file already exists and is non-empty)
Skipped 10.1080.13569317.2013.750182.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01102316.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-2134(2011)0000015009.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbvi.2018.e00095.pdf (output file already exists and is non-empty)
Skipped 10.1504.IJPLAP.2011.043858.pdf (output file already exists and is non-emp

Processando arquivos:  71%|███████   | 924/1297 [57:23<08:04,  1.30s/file]

Skipped 10.1007.978-3-030-44465-5.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409700900104.pdf (output file already exists and is non-empty)
Skipped 10.5465.amp.2015.0135.pdf (output file already exists and is non-empty)
Skipped 10.1007.s43253-020-00018-z.pdf (output file already exists and is non-empty)
Skipped 10.1057.emr.2010.1.pdf (output file already exists and is non-empty)
Skipped 10.1108.13552559610110727.pdf (output file already exists and is non-empty)
Skipped 10.2307.2010178.pdf (output file already exists and is non-empty)
Skipped 10.1108.eb006106.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01539555.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-75817-6_3.pdf (output file already exists and is non-empty)
Skipped 10.1007.bf01103331.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2016-0013.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-030-22121-8_1

Processando arquivos:  74%|███████▍  | 962/1297 [57:23<03:25,  1.63file/s]

Skipped 10.1080.19420676.2011.606331.pdf (output file already exists and is non-empty)
Skipped 10.1111.ecaf.12174.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02357143.pdf (output file already exists and is non-empty)
Skipped 10.1111.rode.12170.pdf (output file already exists and is non-empty)
Skipped 10.1108.MABR-05-2016-0008.pdf (output file already exists and is non-empty)
Skipped 10.1177.0073275314529860.pdf (output file already exists and is non-empty)
Skipped 10.1162.10636140160176170.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-213420150000019004.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-009-0087-3.pdf (output file already exists and is non-empty)
Skipped 10.5325.jaynrandstud.15.2.0131.pdf (output file already exists and is non-empty)
Skipped 10.1108.13552559910300372.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1468-0270.2010.02047.x.pdf (output file already exists and is non

Processando arquivos:  76%|███████▌  | 981/1297 [57:23<02:14,  2.35file/s]

Skipped 10.1016.j.jrurstud.2020.02.005.pdf (output file already exists and is non-empty)
Skipped 10.1068.a36306.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-2134201519.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-016-0347-y.pdf (output file already exists and is non-empty)
Skipped 10.1353.ajp.2007.0021.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.chieco.2021.101602.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1044.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2004.00307.x.pdf (output file already exists and is non-empty)
Skipped 10.1108.MIP-08-2018-0306.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1752-1688.1989.tb05419.x.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1053837211000198.pdf (output file already exists and is non-empty)
Skipped 10.1108.01443589010136951.pdf (output file already exists and is

                                                                           

Skipped 10.4337.9780857931733.00005.pdf (output file already exists and is non-empty)
Skipped ..\data\raw\pdf\from-script\10.1061.(ASCE)1052-3928(1993)119 (file size is zero)
Skipped 10.1007.s11138-017-0388-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-012-0180-x.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbusres.2013.03.017.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-06474-1_1.pdf (output file already exists and is non-empty)
Skipped 10.2307.1061278.pdf (output file already exists and is non-empty)
Skipped 10.1108.IJDI-03-2017-0032.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-08515-9_11.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1744137414000630.pdf (output file already exists and is non-empty)
Skipped 10.1285.i20356609v9i2p339.pdf (output file already exists and is non-empty)
Skipped 10.1093.icc.9.4.659.pdf (output file already exists and is non

                                                                           

Skipped 10.1111.j.1536-7150.1998.tb03380.x.pdf (output file already exists and is non-empty)
Skipped 10.1057.s41296-023-00657-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-020-00519-4.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-D-18-00042.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11293-016-9513-7.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-006-7342-7.pdf (output file already exists and is non-empty)
Skipped 10.1111.beer.12025.pdf (output file already exists and is non-empty)
Skipped 10.1080.00220485.1985.10845117.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.inffus.2007.01.004.pdf (output file already exists and is non-empty)
Skipped 10.1016.0002-9378(81)90106-X.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF01101941.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-005-6826-1.pdf (output file already exists and 

Processando arquivos:  82%|████████▏ | 1058/1297 [57:24<00:24,  9.76file/s]

Skipped 10.1080.07360932.2015.1042491.pdf (output file already exists and is non-empty)
Skipped 10.1080.00346764.2019.1685676.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1752-1688.1988.tb00894.x.pdf (output file already exists and is non-empty)
Skipped 10.1002.smj.726.pdf (output file already exists and is non-empty)
Skipped 10.4324.9780203076514.pdf (output file already exists and is non-empty)
Skipped 10.1111.coep.12453.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF02426363.pdf (output file already exists and is non-empty)
Skipped 10.4337.9781784718237.00007.pdf (output file already exists and is non-empty)
Skipped 10.4324.9781315754581.pdf (output file already exists and is non-empty)


Processando arquivos:  85%|████████▍ | 1101/1297 [1:02:01<08:11,  2.51s/file]

Skipped 10.1017.S1744137414000381.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-05062-1.pdf (output file already exists and is non-empty)
Skipped 10.1111.polp.12330.pdf (output file already exists and is non-empty)
Skipped 10.1561.1400000011.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-10-2015-0030.pdf (output file already exists and is non-empty)
Skipped 10.1177.1474885120960439.pdf (output file already exists and is non-empty)
Skipped 10.1108.JAOC-12-2016-0083.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409700900208.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-014-0273-9.pdf (output file already exists and is non-empty)
Skipped 10.1080.1366271032000163702.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-03-2019-0012.pdf (output file already exists and is non-empty)
Skipped 10.5465.amr.2016.0046.pdf (output file already exists and is non-empty)
Skipped 

Processando arquivos:  85%|████████▍ | 1101/1297 [1:12:01<08:11,  2.51s/file]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.1007.978-3-319-75328-7_13.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




Processando arquivos:  87%|████████▋ | 1123/1297 [1:12:06<29:25, 10.15s/file]

Skipped 10.1007.978-981-19-5470-2.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.physa.2010.02.006.pdf (output file already exists and is non-empty)
Skipped 10.1080.14747731.2020.1791380.pdf (output file already exists and is non-empty)
Skipped 10.1080.1350178X.2021.1926528.pdf (output file already exists and is non-empty)
Skipped 10.1515.auk-2018-0006.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-55926-1_4.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2009.00675.x.pdf (output file already exists and is non-empty)
Skipped 10.1080.00201749008602211.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409600800304.pdf (output file already exists and is non-empty)
Skipped 10.1108.S1529-2134(2012)0000016004.pdf (output file already exists and is non-empty)
Skipped 10.1108.17506221211282000.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-012-0174-8.pdf (output file 

                                                                             

Skipped 10.1007.s11138-008-0043-7.pdf (output file already exists and is non-empty)
Skipped 10.3390.su13116156.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2007.00559.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-014-0274-8.pdf (output file already exists and is non-empty)
Skipped 10.1524.jbwg.2011.0009.pdf (output file already exists and is non-empty)
Skipped 10.1177.092137409200500310.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12232-007-0030-5.pdf (output file already exists and is non-empty)
Skipped 10.1108.eb002567.pdf (output file already exists and is non-empty)
Skipped 10.1080.14759551.2020.1861451.pdf (output file already exists and is non-empty)
Skipped 10.4324.9781315764818-21.pdf (output file already exists and is non-empty)
Skipped 10.5901.mjss.2015.v6n3p646.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-75817-6_2.pdf (output file already exists and is non-empt

Processando arquivos:  91%|█████████ | 1177/1297 [1:12:07<06:01,  3.01s/file]

Skipped 10.1007.PL00003857.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.jbusvent.2021.106159.pdf (output file already exists and is non-empty)
Skipped 10.5305.amerjintelaw.108.4.0650.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-1998-0105.pdf (output file already exists and is non-empty)
Skipped 10.1080.10427710600857856.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-009-0096-2.pdf (output file already exists and is non-empty)
Skipped 10.1215.00182702-2884357.pdf (output file already exists and is non-empty)
Skipped 10.1057.lst.2015.56.pdf (output file already exists and is non-empty)
Skipped 10.5465.AMR.2018.0198.pdf (output file already exists and is non-empty)
Skipped 10.1080.1350178X.2017.1356439.pdf (output file already exists and is non-empty)
Skipped 10.1080.09585192.2012.665068.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-020-00535-4.pdf (output file already exists and is non-e

Processando arquivos:  93%|█████████▎| 1211/1297 [1:12:07<02:05,  1.46s/file]

Skipped 10.1108.S1529-2134(2011)0000015018.pdf (output file already exists and is non-empty)
Skipped 10.1016.S0749-6826(07)11002-7.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1218.pdf (output file already exists and is non-empty)
Skipped 10.1111.ecaf.12088.pdf (output file already exists and is non-empty)
Skipped 10.1515.jbvela-2016-0017.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1053837214000777.pdf (output file already exists and is non-empty)
Skipped 10.1080.02604027.1990.9972197.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEAS-08-2015-0028.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12113-008-9047-1.pdf (output file already exists and is non-empty)
Skipped 10.1108.JOCM-04-2019-0107.pdf (output file already exists and is non-empty)
Skipped 10.1016.0305-750X(94)90177-5.pdf (output file already exists and is non-empty)
Skipped 10.1017.S1062798714000283.pdf (output file already exists a

Processando arquivos:  96%|█████████▌| 1246/1297 [1:12:30<00:49,  1.03file/s]

Skipped 10.1111.j.1467-6281.1987.tb00135.x.pdf (output file already exists and is non-empty)
Skipped 10.1080.23801883.2019.1588198.pdf (output file already exists and is non-empty)
Skipped 10.1007.BF00752436.pdf (output file already exists and is non-empty)
Skipped 10.1007.s00199-009-0454-0.pdf (output file already exists and is non-empty)
Skipped 10.1007.978-3-319-47828-9_11.pdf (output file already exists and is non-empty)
Skipped 10.1108.JEPP-03-2019-0017.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10516-017-9352-4.pdf (output file already exists and is non-empty)
Skipped 10.1108.JIMA-02-2013-0013.pdf (output file already exists and is non-empty)
Skipped 10.1080.21639159.2014.911494.pdf (output file already exists and is non-empty)
Skipped 10.1287.orsc.2018.1225.pdf (output file already exists and is non-empty)
Skipped 10.1515.jeeh-2014-0018.pdf (output file already exists and is non-empty)
Skipped 10.1093.cje.bes091.pdf (output file already exists and is non-

Processando arquivos:  99%|█████████▉| 1284/1297 [1:12:30<00:05,  2.18file/s]

Skipped 10.1177.1035304615599870.pdf (output file already exists and is non-empty)
Skipped 10.1111.ecaf.12490.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1241.pdf (output file already exists and is non-empty)
Skipped 10.1002.9781444307054.ch4.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-006-0009-6.pdf (output file already exists and is non-empty)
Skipped 10.1177.0143831X19893761.pdf (output file already exists and is non-empty)
Skipped 10.1525.ap3a.2007.17.1.20.pdf (output file already exists and is non-empty)
Skipped 10.1007.s11138-018-0415-6.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.ijis.2020.07.001.pdf (output file already exists and is non-empty)
Skipped 10.1111.1468-0270.00393.pdf (output file already exists and is non-empty)
Skipped 10.1016.j.respol.2013.08.016.pdf (output file already exists and is non-empty)
Skipped 10.2202.1145-6396.1063.pdf (output file already exists and is non-empty)
Skipp

Processando arquivos:  99%|█████████▉| 1284/1297 [1:22:30<00:05,  2.18file/s]

The process did not finish within 10 minutes and was terminated.
..\data\interim\tei\10.1007.978-3-319-17241-5.pdf.grobid.tei.xml is empty or small. NOT PROCESSED.




Processando arquivos: 100%|██████████| 1297/1297 [1:22:35<00:00,  3.82s/file]

Skipped 10.1007.s10838-023-09640-x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s12108-017-9353-1.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2005.00423.x.pdf (output file already exists and is non-empty)
Skipped 10.1046.j.1365-2753.2003.00402.x.pdf (output file already exists and is non-empty)
Skipped 10.1007.s10657-012-9347-y.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1468-0335.2009.00793.x.pdf (output file already exists and is non-empty)
Skipped 10.1161.JAHA.115.002489.pdf (output file already exists and is non-empty)
Skipped 10.1111.j.1536-7150.2004.00300.x.pdf (output file already exists and is non-empty)
Skipped 10.5465.AMR.2020.0366.pdf (output file already exists and is non-empty)
Skipped 10.1080.0308514032000073392.pdf (output file already exists and is non-empty)



