In [57]:
import sys
import os
from pathlib import Path

# Append the directory two levels above the working directory to sys.path.
# This must run before the `litscan` import below (presumably that is where
# the package lives -- TODO confirm).
sys.path.append("../..")
from litscan import extract_pdf_text

# Notebook-wide settings; TODO: turn these into command-line arguments.
# All three are relative paths, so they resolve against the working directory.
directory = "../pdfs"  # folder that process_pdfs scans for *.pdf files
pdf_list_file = "../relevant.pmids"  # text file, one entry per line, selecting which PDFs to process
output_dir = "../text"  # folder that receives one .txt file per processed PDF

In [62]:
def get_pdf_filenames_from_file(file_path):
    """
    Reads the list of PDFs to process from an input file, one entry per line.

    Leading and trailing whitespace is trimmed from every line. Blank and
    whitespace-only lines are skipped, so they never appear as an
    empty-string entry (which would inflate the size of the result).

    Args:
        file_path (str): Path to the input file containing one entry per line.

    Returns:
        set: The distinct, non-empty entries of the file. process_pdfs matches
            these against PDF file names without the ".pdf" extension.
    """
    with open(file_path, "r") as file:
        trimmed_lines = (line.strip() for line in file)
        # Drop empty strings left behind by blank lines.
        return {entry for entry in trimmed_lines if entry}

def process_pdfs(directory, input_file, out_dir=None):
    """
    Extracts the text of every listed PDF found in a directory and writes it to disk.

    Only files matching "*.pdf" directly inside `directory` are considered. A
    PDF is processed when its file name without the ".pdf" extension appears
    in `input_file`. For each processed PDF the extracted text is written to
    "<name>.txt" in the destination folder, overwriting any existing file.

    Args:
        directory (str): Path to the directory containing PDFs.
        input_file (str): Path to the file listing which PDFs to process
            (one entry per line, without the ".pdf" extension).
        out_dir (str, optional): Folder that receives the ".txt" files. When
            omitted, the notebook-level `output_dir` setting is used. The
            folder is created if it does not exist.

    Returns:
        None. The results are the files written to the destination folder.
    """
    directory = Path(directory)

    # Fall back to the notebook-level setting when no destination is given.
    destination = Path(output_dir if out_dir is None else out_dir)
    destination.mkdir(parents=True, exist_ok=True)

    # Read the set of entries selecting which PDFs to process.
    wanted = get_pdf_filenames_from_file(input_file)

    # sorted() makes the processing order deterministic across runs.
    for pdf_file in sorted(directory.glob("*.pdf")):
        # Path.stem removes only the final ".pdf" suffix. str.strip(".pdf")
        # would instead strip any leading/trailing '.', 'p', 'd' or 'f'
        # characters and mangle names such as "pdf_123.pdf".
        if pdf_file.stem not in wanted:
            continue

        print(f"Processing: {pdf_file}")
        text = extract_pdf_text(pdf_file)

        output_file = destination / f"{pdf_file.stem}.txt"
        # "w" overwrites an existing file. UTF-8 is set explicitly so
        # non-ASCII characters do not depend on the platform default encoding.
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(text)


In [63]:

# Sanity check: how many distinct entries does the list file contain?
entry_count = len(get_pdf_filenames_from_file(pdf_list_file))
print(entry_count)


1351


In [None]:
# process_pdfs() writes each extracted text to output_dir and returns None,
# so collecting its return value in a list could only ever report a length of 1.
# Run the batch, then count the text files present in output_dir
# (this includes files left over from earlier runs).
process_pdfs(directory, pdf_list_file)
text_files = sorted(Path(output_dir).glob("*.txt"))
print(len(text_files))

Processing: ../pdfs/10006011.pdf
Processing: ../pdfs/10008687.pdf
Processing: ../pdfs/10012531.pdf
Processing: ../pdfs/10014535.pdf
Processing: ../pdfs/10042429.pdf
Processing: ../pdfs/10049239.pdf
Processing: ../pdfs/10051278.pdf
Processing: ../pdfs/10053023.pdf
Processing: ../pdfs/10059747.pdf
Processing: ../pdfs/10063445.pdf
Processing: ../pdfs/10068152.pdf
Processing: ../pdfs/10070278.pdf
Processing: ../pdfs/10070996.pdf
Processing: ../pdfs/10071500.pdf
Processing: ../pdfs/10079969.pdf
Processing: ../pdfs/10093103.pdf
Processing: ../pdfs/10106413.pdf
Processing: ../pdfs/10107112.pdf
Processing: ../pdfs/10133299.pdf
Processing: ../pdfs/10140042.pdf
Processing: ../pdfs/10148064.pdf
Processing: ../pdfs/10153752.pdf
Processing: ../pdfs/10155825.pdf
Processing: ../pdfs/10165364.pdf
Processing: ../pdfs/10169864.pdf
Processing: ../pdfs/10175245.pdf
Processing: ../pdfs/10183554.pdf
Processing: ../pdfs/10187000.pdf
Processing: ../pdfs/10187297.pdf
Processing: ../pdfs/10191186.pdf
Processing

incorrect startxref pointer(2)


Processing: ../pdfs/10208015.pdf
Processing: ../pdfs/10217178.pdf
Processing: ../pdfs/10218234.pdf
Processing: ../pdfs/10236629.pdf
Processing: ../pdfs/10242111.pdf
Processing: ../pdfs/10249030.pdf
Processing: ../pdfs/10252496.pdf
Processing: ../pdfs/10262772.pdf
Processing: ../pdfs/10279647.pdf
Processing: ../pdfs/10281878.pdf
Processing: ../pdfs/10287916.pdf
Processing: ../pdfs/10290629.pdf
Processing: ../pdfs/10292663.pdf
Processing: ../pdfs/10292888.pdf
Processing: ../pdfs/10298926.pdf
Processing: ../pdfs/10302480.pdf
Processing: ../pdfs/10304724.pdf
Processing: ../pdfs/10315745.pdf
Processing: ../pdfs/10325098.pdf
Processing: ../pdfs/10325925.pdf
Processing: ../pdfs/10326603.pdf
Processing: ../pdfs/10341502.pdf
Processing: ../pdfs/10343678.pdf
Processing: ../pdfs/10351475.pdf
Processing: ../pdfs/10362224.pdf
Processing: ../pdfs/10373506.pdf
Processing: ../pdfs/10385182.pdf
Processing: ../pdfs/10385642.pdf
Processing: ../pdfs/10400078.pdf
Processing: ../pdfs/10412483.pdf
Processing



Processing: ../pdfs/3814310.pdf
Processing: ../pdfs/3814391.pdf
Processing: ../pdfs/3837099.pdf
Processing: ../pdfs/3844290.pdf
Processing: ../pdfs/3853299.pdf
Processing: ../pdfs/3858899.pdf
Processing: ../pdfs/3873250.pdf
Processing: ../pdfs/3890808.pdf
Processing: ../pdfs/3896626.pdf
Processing: ../pdfs/3907295.pdf
Processing: ../pdfs/3936505.pdf
Processing: ../pdfs/3937641.pdf
Processing: ../pdfs/3962722.pdf
Processing: ../pdfs/3982150.pdf
Processing: ../pdfs/3988321.pdf
Processing: ../pdfs/4006353.pdf
Processing: ../pdfs/4023931.pdf
Processing: ../pdfs/4031557.pdf
Processing: ../pdfs/4040709.pdf
Processing: ../pdfs/4048339.pdf
Processing: ../pdfs/4049325.pdf
Processing: ../pdfs/4051888.pdf
Processing: ../pdfs/4065146.pdf
Processing: ../pdfs/4065404.pdf
Processing: ../pdfs/4066805.pdf
Processing: ../pdfs/4085727.pdf
Processing: ../pdfs/4091169.pdf
Processing: ../pdfs/4113420.pdf
Processing: ../pdfs/4132783.pdf
Processing: ../pdfs/4160428.pdf
Processing: ../pdfs/4166379.pdf
Processi

Invalid parent xref., rebuild xref


Processing: ../pdfs/4303889.pdf
Processing: ../pdfs/4347643.pdf
Processing: ../pdfs/4350902.pdf
Processing: ../pdfs/4355538.pdf
Processing: ../pdfs/4366900.pdf
Processing: ../pdfs/4366949.pdf
Processing: ../pdfs/4369404.pdf
Processing: ../pdfs/4374139.pdf
Processing: ../pdfs/4385936.pdf
Processing: ../pdfs/4386448.pdf
Processing: ../pdfs/4398628.pdf
Processing: ../pdfs/4413438.pdf
Processing: ../pdfs/4424361.pdf
Processing: ../pdfs/4428774.pdf
Processing: ../pdfs/4430267.pdf
Processing: ../pdfs/4453952.pdf
Processing: ../pdfs/4454319.pdf
Processing: ../pdfs/4454526.pdf
Processing: ../pdfs/4462357.pdf
Processing: ../pdfs/4463680.pdf
Processing: ../pdfs/4499145.pdf
Processing: ../pdfs/4500567.pdf
Processing: ../pdfs/4502706.pdf
Processing: ../pdfs/4509444.pdf
Processing: ../pdfs/4509454.pdf
Processing: ../pdfs/4518712.pdf
Processing: ../pdfs/4529964.pdf
Processing: ../pdfs/4563582.pdf
Processing: ../pdfs/4571882.pdf
Processing: ../pdfs/4573855.pdf
Processing: ../pdfs/4605295.pdf
Processi

Multiple definitions in dictionary at byte 0x2e6be for key /MediaBox
Multiple definitions in dictionary at byte 0x2e90e for key /MediaBox
Multiple definitions in dictionary at byte 0x2eb07 for key /MediaBox
Multiple definitions in dictionary at byte 0x2ec80 for key /MediaBox
Multiple definitions in dictionary at byte 0x2ee76 for key /MediaBox
Multiple definitions in dictionary at byte 0x2f04c for key /MediaBox
Multiple definitions in dictionary at byte 0x2f1f2 for key /MediaBox
Multiple definitions in dictionary at byte 0x2f3c3 for key /MediaBox
Multiple definitions in dictionary at byte 0x2f569 for key /MediaBox
Multiple definitions in dictionary at byte 0x2f717 for key /MediaBox
Multiple definitions in dictionary at byte 0x2f889 for key /MediaBox
Multiple definitions in dictionary at byte 0x2fa23 for key /MediaBox
Multiple definitions in dictionary at byte 0x2fcdc for key /MediaBox
Multiple definitions in dictionary at byte 0x2fefe for key /MediaBox
Multiple definitions in dictionary

Processing: ../pdfs/5741252.pdf
Processing: ../pdfs/5742284.pdf
Processing: ../pdfs/5751207.pdf
Processing: ../pdfs/5760035.pdf
Processing: ../pdfs/5772365.pdf
Processing: ../pdfs/5777528.pdf
Processing: ../pdfs/5778871.pdf
Processing: ../pdfs/5801751.pdf
Processing: ../pdfs/5806701.pdf
Processing: ../pdfs/5811346.pdf
Processing: ../pdfs/5826808.pdf
Processing: ../pdfs/5826840.pdf
Processing: ../pdfs/5833651.pdf
Processing: ../pdfs/5836413.pdf
Processing: ../pdfs/5838979.pdf
Processing: ../pdfs/5852553.pdf
Processing: ../pdfs/5879922.pdf
Processing: ../pdfs/5888579.pdf
Processing: ../pdfs/5890823.pdf
Processing: ../pdfs/5895775.pdf
Processing: ../pdfs/5902222.pdf
Processing: ../pdfs/5903426.pdf
Processing: ../pdfs/5905826.pdf
Processing: ../pdfs/5910004.pdf
Processing: ../pdfs/5910822.pdf
Processing: ../pdfs/5935120.pdf
Processing: ../pdfs/5937186.pdf
Processing: ../pdfs/5941560.pdf
Processing: ../pdfs/5964100.pdf
Processing: ../pdfs/5964191.pdf
Processing: ../pdfs/5970242.pdf
Processi

Multiple definitions in dictionary at byte 0x22894 for key /MediaBox
Multiple definitions in dictionary at byte 0x22a77 for key /MediaBox
Multiple definitions in dictionary at byte 0x22c2d for key /MediaBox
Multiple definitions in dictionary at byte 0x22db3 for key /MediaBox
Multiple definitions in dictionary at byte 0x22f69 for key /MediaBox
Multiple definitions in dictionary at byte 0x231af for key /MediaBox
Multiple definitions in dictionary at byte 0x23308 for key /MediaBox
Multiple definitions in dictionary at byte 0x23461 for key /MediaBox
Multiple definitions in dictionary at byte 0x235ba for key /MediaBox
Multiple definitions in dictionary at byte 0x23733 for key /MediaBox
Multiple definitions in dictionary at byte 0x238f1 for key /MediaBox
Multiple definitions in dictionary at byte 0x23b0a for key /MediaBox
Multiple definitions in dictionary at byte 0x23d0b for key /MediaBox
Multiple definitions in dictionary at byte 0x23f84 for key /MediaBox


Processing: ../pdfs/6410659.pdf
Processing: ../pdfs/6428753.pdf
Processing: ../pdfs/6442302.pdf
Processing: ../pdfs/6442620.pdf
Processing: ../pdfs/6447890.pdf
Processing: ../pdfs/6451104.pdf
Processing: ../pdfs/6463782.pdf
Processing: ../pdfs/6468170.pdf
Processing: ../pdfs/6484314.pdf
Processing: ../pdfs/6491582.pdf
Processing: ../pdfs/6503771.pdf
Processing: ../pdfs/6504209.pdf
Processing: ../pdfs/6512023.pdf
Processing: ../pdfs/6526809.pdf
Processing: ../pdfs/6538666.pdf
Processing: ../pdfs/6538672.pdf
Processing: ../pdfs/6547018.pdf
Processing: ../pdfs/6558473.pdf
Processing: ../pdfs/6573989.pdf
Processing: ../pdfs/6589683.pdf
Processing: ../pdfs/6596405.pdf
Processing: ../pdfs/6601508.pdf
Processing: ../pdfs/6605786.pdf
Processing: ../pdfs/6606338.pdf
Processing: ../pdfs/6606649.pdf
Processing: ../pdfs/6611005.pdf
Processing: ../pdfs/6611899.pdf
Processing: ../pdfs/6616334.pdf
Processing: ../pdfs/6625828.pdf


Multiple definitions in dictionary at byte 0x1b901 for key /MediaBox
Multiple definitions in dictionary at byte 0x1bce0 for key /MediaBox
Multiple definitions in dictionary at byte 0x1beaf for key /MediaBox
Multiple definitions in dictionary at byte 0x1c359 for key /MediaBox
Multiple definitions in dictionary at byte 0x1c540 for key /MediaBox
Multiple definitions in dictionary at byte 0x1c7a2 for key /MediaBox
Multiple definitions in dictionary at byte 0x1ca4c for key /MediaBox
Multiple definitions in dictionary at byte 0x1cc4e for key /MediaBox


Processing: ../pdfs/6646425.pdf
Processing: ../pdfs/6648349.pdf
Processing: ../pdfs/6649855.pdf
Processing: ../pdfs/6652928.pdf
Processing: ../pdfs/6666396.pdf
Processing: ../pdfs/6687620.pdf
Processing: ../pdfs/6687843.pdf
Processing: ../pdfs/6693454.pdf
Processing: ../pdfs/6696456.pdf
Processing: ../pdfs/6701435.pdf
Processing: ../pdfs/6707996.pdf
Processing: ../pdfs/6708234.pdf
Processing: ../pdfs/6717232.pdf
Processing: ../pdfs/6741846.pdf


Multiple definitions in dictionary at byte 0x1b0d9 for key /MediaBox
Multiple definitions in dictionary at byte 0x1b4d0 for key /MediaBox
Multiple definitions in dictionary at byte 0x1b8b2 for key /MediaBox
Multiple definitions in dictionary at byte 0x1bb41 for key /MediaBox
Multiple definitions in dictionary at byte 0x1be1b for key /MediaBox
Multiple definitions in dictionary at byte 0x1c0ad for key /MediaBox
Multiple definitions in dictionary at byte 0x1c317 for key /MediaBox


Processing: ../pdfs/6743340.pdf
Processing: ../pdfs/6746689.pdf
Processing: ../pdfs/6753493.pdf
Processing: ../pdfs/6764946.pdf
Processing: ../pdfs/6769516.pdf
Processing: ../pdfs/6775667.pdf


Multiple definitions in dictionary at byte 0x22c27 for key /MediaBox
Multiple definitions in dictionary at byte 0x22e4b for key /MediaBox
Multiple definitions in dictionary at byte 0x2301a for key /MediaBox
Multiple definitions in dictionary at byte 0x231c9 for key /MediaBox
Multiple definitions in dictionary at byte 0x233b0 for key /MediaBox
Multiple definitions in dictionary at byte 0x2354f for key /MediaBox
Multiple definitions in dictionary at byte 0x236e6 for key /MediaBox
Multiple definitions in dictionary at byte 0x23980 for key /MediaBox
Multiple definitions in dictionary at byte 0x23b7a for key /MediaBox
Multiple definitions in dictionary at byte 0x23dc4 for key /MediaBox
Multiple definitions in dictionary at byte 0x24036 for key /MediaBox


Processing: ../pdfs/6779824.pdf
Processing: ../pdfs/6781674.pdf
Processing: ../pdfs/6781985.pdf
Processing: ../pdfs/6791307.pdf
Processing: ../pdfs/6797809.pdf
Processing: ../pdfs/6801922.pdf
Processing: ../pdfs/6814607.pdf
Processing: ../pdfs/6819422.pdf
Processing: ../pdfs/6837739.pdf
Processing: ../pdfs/6839104.pdf
Processing: ../pdfs/6844504.pdf
Processing: ../pdfs/6847725.pdf
Processing: ../pdfs/6873185.pdf
Processing: ../pdfs/6874311.pdf
Processing: ../pdfs/6886351.pdf
Processing: ../pdfs/6898758.pdf
Processing: ../pdfs/6911001.pdf


Multiple definitions in dictionary at byte 0x27fd7 for key /MediaBox
Multiple definitions in dictionary at byte 0x281b0 for key /MediaBox
Multiple definitions in dictionary at byte 0x28362 for key /MediaBox
Multiple definitions in dictionary at byte 0x28554 for key /MediaBox
Multiple definitions in dictionary at byte 0x28736 for key /MediaBox
Multiple definitions in dictionary at byte 0x288e0 for key /MediaBox
Multiple definitions in dictionary at byte 0x28afa for key /MediaBox
Multiple definitions in dictionary at byte 0x28cb1 for key /MediaBox
Multiple definitions in dictionary at byte 0x28e23 for key /MediaBox
Multiple definitions in dictionary at byte 0x28f85 for key /MediaBox
Multiple definitions in dictionary at byte 0x2917f for key /MediaBox
Multiple definitions in dictionary at byte 0x293e1 for key /MediaBox
Multiple definitions in dictionary at byte 0x295a3 for key /MediaBox


Processing: ../pdfs/6933010.pdf
Processing: ../pdfs/6942053.pdf
Processing: ../pdfs/6952610.pdf
Processing: ../pdfs/6959288.pdf
Processing: ../pdfs/6966434.pdf
Processing: ../pdfs/6984605.pdf
Processing: ../pdfs/6984615.pdf
Processing: ../pdfs/6990542.pdf
Processing: ../pdfs/6993210.pdf
Processing: ../pdfs/7001145.pdf
Processing: ../pdfs/7016097.pdf
Processing: ../pdfs/7020595.pdf
Processing: ../pdfs/7026052.pdf
Processing: ../pdfs/7028996.pdf
Processing: ../pdfs/7036264.pdf
Processing: ../pdfs/7043703.pdf
Processing: ../pdfs/7047589.pdf
Processing: ../pdfs/7060797.pdf
Processing: ../pdfs/7072512.pdf
Processing: ../pdfs/7072626.pdf
Processing: ../pdfs/7073305.pdf
Processing: ../pdfs/7082962.pdf
Processing: ../pdfs/7085117.pdf
Processing: ../pdfs/7094861.pdf
Processing: ../pdfs/7178533.pdf
Processing: ../pdfs/7183222.pdf
Processing: ../pdfs/7192953.pdf
Processing: ../pdfs/7197349.pdf
Processing: ../pdfs/7202976.pdf
Processing: ../pdfs/7206129.pdf
Processing: ../pdfs/7214144.pdf
Processi

In [61]:
# Scratch check: os.path.join inserts the platform path separator between parts.
os.path.join("a", "b")

'a/b'