In [None]:
!pip install pymupdf4llm
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.17-py3-none-any.whl.metadata (4.1 kB)
Collecting pymupdf>=1.24.10 (from pymupdf4llm)
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.17-py3-none-any.whl (26 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.25.2 pymupdf4llm-0.0.17
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the pack

In [None]:
import os
import spacy
import pymupdf4llm

def convert_pdfs_to_sentences(input_folder, output_file):
    """
    Extract the sentences of every PDF in a folder into a single text file.

    1. Collects the regular files directly inside `input_folder` whose name
       ends with ".pdf" (case-insensitive), in sorted name order.
    2. Converts each PDF to markdown text using pymupdf4llm.
    3. Replaces every newline character with a space.
    4. Uses spaCy to split the text into sentences.
    5. Writes all sentences from all PDFs into `output_file`, one sentence per
       line (empty sentences are skipped).

    Args:
        input_folder: Path of the folder that contains the PDF files.
        output_file: Path of the text file to create (overwritten if present).

    Raises:
        OSError: If `input_folder` cannot be listed (e.g. FileNotFoundError).
            This is raised before `output_file` is opened, so an existing
            output file is left untouched.
    """

    # List the inputs BEFORE opening the output: opening with "w" truncates the
    # file, so a mistyped input folder must fail first. Sorting makes the order
    # of sentences in the output reproducible (os.listdir order is arbitrary).
    pdf_names = sorted(
        name
        for name in os.listdir(input_folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_folder, name))
    )

    # Load the spaCy English model once, and only if there is work to do
    nlp = spacy.load("en_core_web_sm") if pdf_names else None

    # Open the output file
    with open(output_file, "w", encoding="utf-8") as out_f:
        for filename in pdf_names:
            pdf_path = os.path.join(input_folder, filename)

            # Convert PDF to text (markdown format) using pymupdf4llm
            text = pymupdf4llm.to_markdown(pdf_path)

            # Flatten the text to one line so line breaks inside a paragraph
            # are not treated as content by the sentence splitter
            text_no_newlines = text.replace("\n", " ")

            # Use spaCy to split text into sentences
            doc = nlp(text_no_newlines)

            # Write each non-empty sentence on its own line
            for sent in doc.sents:
                sentence_text = sent.text.strip()
                if sentence_text:
                    out_f.write(sentence_text + "\n")

    print(f"Sentences from all PDFs in '{input_folder}' have been written to '{output_file}'.")



# Folder that holds the source PDFs
input_folder_path = "papers"

# Text file that receives the extracted sentences
output_file_path = "combined_sentences.txt"

# Kick off the PDF -> sentences conversion
convert_pdfs_to_sentences(
    input_folder=input_folder_path,
    output_file=output_file_path,
)


Processing papers/li-et-al-2010-association-mapping-of-local-climate-sensitive-quantitative-trait-loci-in-arabidopsis-thaliana.pdf...
Processing papers/2020.10.21.348912v1.full.pdf...
Processing papers/A Coastal Cline in Sodium Accumulation in Arabidopsis.pdf...
Processing papers/Genome-wide Association Mapping Identifies a New.pdf...
Processing papers/Genome-Wide Association Studies Identify Heavy Metal.pdf...
Processing papers/fpls-12-689316.pdf...
Processing papers/460360v1.full.pdf...
Processing papers/2023.05.15.540763v1.full.pdf...
Sentences from all PDFs in 'papers' have been written to 'combined_sentences.txt'.


In [None]:
import os
import spacy
import pymupdf4llm

def convert_pdfs_to_sentences_separate_files(input_folder, output_folder):
    """
    Extract the sentences of every PDF in a folder into one text file per PDF.

    1. Collects the regular files directly inside `input_folder` whose name
       ends with ".pdf" (case-insensitive), in sorted name order.
    2. Converts each PDF to markdown text using pymupdf4llm.
    3. Replaces every newline character with a space.
    4. Uses spaCy to split the text into sentences.
    5. Writes sentences for each PDF into its own .txt file in `output_folder`,
       one sentence per line (empty sentences are skipped).
       The output file has the same base name as the PDF but with a .txt extension.

    Args:
        input_folder: Path of the folder that contains the PDF files.
        output_folder: Path of the folder for the .txt files; created if it
            does not exist.

    Raises:
        OSError: If `input_folder` cannot be listed (e.g. FileNotFoundError).
            This is raised before `output_folder` is created, so a mistyped
            input path leaves no stray output folder behind.
    """

    # Validate and list the inputs first; sorting makes the processing order
    # reproducible (os.listdir order is arbitrary).
    pdf_names = sorted(
        name
        for name in os.listdir(input_folder)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_folder, name))
    )

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Load the spaCy English model once, and only if there is work to do
    nlp = spacy.load("en_core_web_sm") if pdf_names else None

    for filename in pdf_names:
        pdf_path = os.path.join(input_folder, filename)

        # Convert PDF to text (markdown format) using pymupdf4llm
        text = pymupdf4llm.to_markdown(pdf_path)

        # Flatten the text to one line so line breaks inside a paragraph
        # are not treated as content by the sentence splitter
        text_no_newlines = text.replace("\n", " ")

        # Use spaCy to split text into sentences
        doc = nlp(text_no_newlines)

        # Create the output file name (replace .pdf with .txt)
        base_name = os.path.splitext(filename)[0]
        output_file = os.path.join(output_folder, base_name + ".txt")

        # Write each non-empty sentence on its own line
        with open(output_file, "w", encoding="utf-8") as out_f:
            for sent in doc.sents:
                sentence_text = sent.text.strip()
                if sentence_text:
                    out_f.write(sentence_text + "\n")

        print(f"Sentences for '{filename}' have been written to '{output_file}'.")


# Example usage:
if __name__ == "__main__":
    # Source PDFs live in this folder
    input_folder_path = "papers"

    # The per-PDF TXT files are written into this folder
    output_folder_path = "text_outputs"

    # Convert every PDF into its own sentence file
    convert_pdfs_to_sentences_separate_files(
        input_folder=input_folder_path,
        output_folder=output_folder_path,
    )


Processing papers/li-et-al-2010-association-mapping-of-local-climate-sensitive-quantitative-trait-loci-in-arabidopsis-thaliana.pdf...
Sentences for 'li-et-al-2010-association-mapping-of-local-climate-sensitive-quantitative-trait-loci-in-arabidopsis-thaliana.pdf' have been written to 'text_outputs/li-et-al-2010-association-mapping-of-local-climate-sensitive-quantitative-trait-loci-in-arabidopsis-thaliana.txt'.
Processing papers/2020.10.21.348912v1.full.pdf...
Sentences for '2020.10.21.348912v1.full.pdf' have been written to 'text_outputs/2020.10.21.348912v1.full.txt'.
Processing papers/A Coastal Cline in Sodium Accumulation in Arabidopsis.pdf...
Sentences for 'A Coastal Cline in Sodium Accumulation in Arabidopsis.pdf' have been written to 'text_outputs/A Coastal Cline in Sodium Accumulation in Arabidopsis.txt'.
Processing papers/Genome-wide Association Mapping Identifies a New.pdf...
Sentences for 'Genome-wide Association Mapping Identifies a New.pdf' have been written to 'text_outputs/

In [None]:
import os

def combine_txt_files(input_folder, output_file):
    """
    Combines all .txt files from the given 'input_folder'
    into a single output file specified by 'output_file'.

    Files are appended in sorted name order. Each non-blank line of the
    individual files is written to the output with surrounding whitespace
    stripped; blank lines are dropped.

    Args:
        input_folder: Path of the folder that contains the .txt files.
        output_file: Path of the combined file to create (overwritten if
            present). It may live inside `input_folder`; it is then excluded
            from the inputs.

    Raises:
        OSError: If `input_folder` cannot be listed (e.g. FileNotFoundError).
            This is raised before `output_file` is opened, so an existing
            output file is left untouched.
    """
    # List the inputs BEFORE opening the output: opening with "w" truncates the
    # file, so a mistyped input folder must fail first.
    # Sort file names so they are combined in a consistent order.
    txt_paths = [
        os.path.join(input_folder, name)
        for name in sorted(os.listdir(input_folder))
        if name.lower().endswith(".txt")
    ]
    txt_paths = [path for path in txt_paths if os.path.isfile(path)]

    with open(output_file, "w", encoding="utf-8") as outfile:
        for file_path in txt_paths:
            # Never read the file we are writing: it would feed its own
            # output back into itself (duplicated lines, potentially endless).
            if os.path.samefile(file_path, output_file):
                continue
            with open(file_path, "r", encoding="utf-8") as infile:
                for line in infile:
                    # Strip and re-add newline to ensure clean lines
                    line_stripped = line.strip()
                    if line_stripped:
                        outfile.write(line_stripped + "\n")

    print(f"All .txt files in '{input_folder}' have been combined into '{output_file}'.")

if __name__ == "__main__":
    # Folder holding the per-document .txt files
    text_outputs_folder = "text_outputs"

    # Name of the merged file to produce
    combined_output_file = "all_sentences_combined.txt"

    # Merge every .txt file into the single output
    combine_txt_files(
        input_folder=text_outputs_folder,
        output_file=combined_output_file,
    )


All .txt files in 'text_outputs' have been combined into 'all_sentences_combined.txt'.
