### Step 1: Extract text from the PDFs

In [2]:
from pathlib import Path
import PyPDF2

In [3]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Pages are visited in the order the reader exposes them and their text is
    concatenated with no separator. A page for which ``extract_text()`` yields
    nothing (``None`` or an empty string) contributes an empty string instead
    of raising ``TypeError`` during concatenation.

    :param pdf_path: Path to the PDF file (anything ``open`` accepts).
    :return: Extracted text as a single string; empty if no page has text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Build the result inside the ``with`` so the file handle is still
        # open while pages are read; a single join avoids repeatedly
        # re-allocating a growing string.
        return "".join(page.extract_text() or "" for page in reader.pages)

In [4]:
def extract_text_from_pdfs(input_dir, output_dir):
    """
    Extracts text from all PDF files in a directory and saves them to text files.

    Only the top level of ``input_dir`` is scanned (no recursion). A file is
    treated as a PDF when its name ends in ``.pdf`` in any letter case, so
    ``report.PDF`` is picked up on case-sensitive filesystems too. Files are
    processed in case-insensitive name order so repeated runs print and write
    in the same sequence. Each PDF ``<stem>.pdf`` produces ``<stem>.txt``
    (UTF-8) in ``output_dir``, overwriting any existing file of that name.

    :param input_dir: Directory containing PDF files.
    :param output_dir: Directory to save extracted text files; created
        together with any missing parents.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    pdf_files = sorted(
        (
            entry
            for entry in input_path.glob("*")
            # Skip directories that merely happen to be named "*.pdf".
            if entry.is_file() and entry.name.lower().endswith(".pdf")
        ),
        # Case-insensitive order; the raw name breaks ties deterministically.
        key=lambda entry: (entry.name.lower(), entry.name),
    )

    for pdf_file in pdf_files:
        print(f"Processing {pdf_file.name}")
        text = extract_text_from_pdf(pdf_file)
        output_file = output_path / f"{pdf_file.stem}.txt"
        output_file.write_text(text, encoding='utf-8')
        print(f"Extracted text saved to {output_file}")

In [7]:
# Usage: convert every PDF in the project folder and write the resulting
# .txt files into its "summary" subfolder.
# NOTE(review): both paths are absolute and machine-specific.
extract_text_from_pdfs(
    input_dir="D:\\src_git\\LP\\LP\\projects\\paper_summarize\\",
    output_dir="D:\\src_git\\LP\\LP\\projects\\paper_summarize\\summary\\",
)

Processing kafka.pdf
Extracted text saved to D:\src_git\LP\LP\projects\paper_summarize\summary\kafka.txt
Processing unikernels.pdf
Extracted text saved to D:\src_git\LP\LP\projects\paper_summarize\summary\unikernels.txt
