PyPDF2 is used to extract text from the PDF files. One chunk is created per page and saved in a folder, with each chunk file named after its source document and its page number.

In [None]:
import json
import os
import PyPDF2  

In [None]:
from PyPDF2 import PdfReader

def convert_pdfs_to_txt(directory, chunk_size=1000, output_dir=None):
  """
  Walk a directory tree and write every page of every PDF to its own TXT file.

  Each output file is named "<pdf name without extension>_page_<n>.txt", with
  pages numbered from 1, and contains the lines of text extracted from that
  page, each terminated by a newline. Files are written with the platform
  default text encoding, which is also what txt_to_json uses to read them.

  Args:
    directory: Path to the directory containing PDFs (searched recursively).
    chunk_size: Number of lines joined and written per write call. It only
      batches the writes; the resulting file content does not depend on it.
    output_dir: Directory that receives the TXT files. When omitted, the
      module-level `chunk_dir` variable is used. Created if it is missing.

  Raises:
    ValueError: If chunk_size is smaller than 1.
  """
  if chunk_size < 1:
    raise ValueError("chunk_size must be a positive integer")
  if output_dir is None:
    # Backward compatibility: earlier callers configured the target folder
    # through the notebook-level `chunk_dir` variable.
    output_dir = chunk_dir
  os.makedirs(output_dir, exist_ok=True)

  for path, _, files in os.walk(directory):
    for filename in files:
      # Accept ".pdf" in any letter case (e.g. "REPORT.PDF").
      if not filename.lower().endswith(".pdf"):
        continue

      pdf_reader = PdfReader(os.path.join(path, filename))
      base_name = os.path.splitext(filename)[0]

      for page_num, page in enumerate(pdf_reader.pages, start=1):
        # Defensive: treat a missing extraction result as an empty page.
        lines = (page.extract_text() or "").splitlines()

        # Create filename with page number
        txt_path = os.path.join(output_dir, f"{base_name}_page_{page_num}.txt")

        with open(txt_path, "w") as txt_file:
          # Write each batch exactly once. The range already covers the
          # final, shorter batch, so no separate "remaining lines" write
          # is needed (it would duplicate text on long pages).
          for start in range(0, len(lines), chunk_size):
            txt_file.write("\n".join(lines[start:start + chunk_size]) + "\n")

directory = "../docs/"  # local folder where the PDF files are saved
chunk_dir = "../chunks/"  # local folder where the per-page text chunks are saved

# The chunk folder must exist before any page file can be opened for writing.
os.makedirs(chunk_dir, exist_ok=True)
convert_pdfs_to_txt(directory)

# Print the list of PDF files and the processed TXT files
print(os.listdir(directory))
print(os.listdir(chunk_dir))



Create a list in which each chunk is stored as a dictionary. Each dictionary contains three key-value pairs: ID, Title and Content. Title stores the name of the document chunk together with its page number, for use as metadata while indexing for RAG apps.

In [None]:
# chunk_dir is taken from the previous step.

def txt_to_json(chunk_dir, output_file, output_dir=None):
  """
  Collect the TXT files of a directory into a single JSON file.

  The JSON file holds a list of dictionaries, one per TXT file, with the keys
  "ID" (counter starting at 1), "Title" (file name without the ".txt"
  extension) and "Content" (the full text of the file). Files are processed
  in sorted name order so that IDs are reproducible between runs.

  Args:
      chunk_dir: Path to the directory containing the txt files.
      output_file: Name of the output JSON file, joined onto output_dir.
      output_dir: Directory that receives the JSON file. When omitted, the
          module-level `json_dir` variable is used. Created if it is missing.
  """
  if output_dir is None:
    # Backward compatibility: earlier callers configured the target folder
    # through the notebook-level `json_dir` variable.
    output_dir = json_dir
  os.makedirs(output_dir, exist_ok=True)

  data = []

  # os.listdir gives no ordering guarantee; sort for stable ID assignment.
  for filename in sorted(os.listdir(chunk_dir)):
    file_path = os.path.join(chunk_dir, filename)
    title, extension = os.path.splitext(filename)

    # Skip sub-directories and anything that is not a text chunk.
    if extension.lower() != ".txt" or not os.path.isfile(file_path):
      continue

    with open(file_path, "r") as f:
      content = f.read()

    data.append({
        "ID": len(data) + 1,
        "Title": title,
        "Content": content,
    })

  with open(os.path.join(output_dir, output_file), "w") as f:
    json.dump(data, f, indent=4)



output_file = "data.json"  # name of the JSON file
json_dir = "../jsonfiles/"  # local path to save the JSON files

# The target folder must exist before the JSON file can be opened for writing.
os.makedirs(json_dir, exist_ok=True)
txt_to_json(chunk_dir, output_file)
print(f"Successfully created JSON file: {output_file}")
