In [None]:
import fitz  # PyMuPDF
import tiktoken
import spacy
import json
import os

# Load the spaCy "en_core_web_sm" pipeline once at import time; it is used for sentence splitting.
nlp = spacy.load("en_core_web_sm")

# Load the tiktoken "cl100k_base" encoding once at import time; it is used for token counting and splitting.
tokenizer = tiktoken.get_encoding("cl100k_base")  # Swap in the encoding that matches your target LLM.

def count_tokens(text):
    """
    Return how many tokens the module-level tokenizer produces for ``text``.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def split_sentences_spacy(text):
    """
    Split the input text into sentences using spaCy's sentence boundary detection.

    Args:
        text: The raw text to split.

    Returns:
        A list of sentence strings in document order, each stripped of
        surrounding whitespace. Sentences that are empty after stripping are
        omitted, so callers never receive "" entries.
    """
    # Nothing to split: skip running the spaCy pipeline on blank input.
    if not text or not text.strip():
        return []

    stripped_sentences = (sent.text.strip() for sent in nlp(text).sents)
    # Drop spans that consisted only of whitespace; joining them into chunks
    # would only introduce stray spaces.
    return [sentence for sentence in stripped_sentences if sentence]


def split_long_sentence(sentence, max_tokens=500):
    """
    Split a very long sentence into smaller chunks using the module-level tokenizer.

    The sentence is encoded once, the token ids are cut into consecutive
    windows of at most ``max_tokens`` ids, and each window is decoded back to
    text and stripped.

    Args:
        sentence: The text to split.
        max_tokens: Maximum number of token ids per window; must be >= 1.

    Returns:
        A list of non-empty text chunks in their original order.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # A zero step makes range() raise and a negative step yields no windows
    # at all (silently dropping the text), so reject both up front.
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    token_ids = tokenizer.encode(sentence)
    chunks = []
    for start in range(0, len(token_ids), max_tokens):
        # NOTE(review): a window boundary may fall inside a character that
        # spans several tokens - confirm how tokenizer.decode renders that.
        piece = tokenizer.decode(token_ids[start:start + max_tokens]).strip()
        # A window made only of whitespace tokens is empty after stripping.
        if piece:
            chunks.append(piece)
    return chunks


def chunk_text(text, max_tokens=500):
    """
    Chunk the input text into smaller pieces using token limits.

    Sentences are packed greedily, in document order, into chunks joined by a
    single space. A sentence that is itself longer than ``max_tokens`` is
    split with ``split_long_sentence`` and its pieces are emitted as chunks
    of their own.

    Args:
        text: The text to chunk.
        max_tokens: Maximum number of tokens allowed per chunk; must be >= 1.

    Returns:
        A list of non-empty chunk strings in document order.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    chunks = []
    current_chunk = ""

    for sent in split_sentences_spacy(text):
        # Empty sentences contribute nothing but stray separators.
        if not sent:
            continue

        # Handle sentences that exceed the max token limit on their own.
        if count_tokens(sent) > max_tokens:
            # Flush the pending chunk first so output stays in document order.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            chunks.extend(
                piece for piece in split_long_sentence(sent, max_tokens) if piece
            )
            continue

        if not current_chunk:
            current_chunk = sent
            continue

        # Measure the joined text rather than summing per-sentence counts:
        # the separating space can change how the boundary is tokenized.
        candidate = f"{current_chunk} {sent}"
        if count_tokens(candidate) <= max_tokens:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = sent

    # Add the last chunk if it exists.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def extract_pdf_content(pdf_path):
    """
    Extract text content from a PDF document using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of dicts, one per page whose text is not blank, each with
        "page_number" (1-based) and "text" (the page text as extracted).
    """
    extracted_content = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()  # Extract text from the current page

            if text.strip():  # Skip empty pages
                extracted_content.append({
                    "page_number": page_number,
                    "text": text
                })

    return extracted_content


def process_pdf_for_training(pdf_path, output_dir="output_data", max_tokens=500):
    """
    Turn a PDF into a JSON file of per-page text chunks.

    The output directory is created if needed, each page's text is chunked
    with ``chunk_text``, and the records are written to
    ``<output_dir>/<pdf base name>_training_data.json``. Chunk numbers start
    at 1 and restart on every page.
    """
    os.makedirs(output_dir, exist_ok=True)

    records = []
    for page_entry in extract_pdf_content(pdf_path):
        page_chunks = chunk_text(page_entry["text"], max_tokens)
        # One record per chunk, numbered from 1 within its page.
        records.extend(
            {
                "page_number": page_entry["page_number"],
                "chunk_number": position,
                "chunk_text": piece,
            }
            for position, piece in enumerate(page_chunks, start=1)
        )

    # Name the output file after the PDF, minus its extension.
    pdf_stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    destination = os.path.join(output_dir, f"{pdf_stem}_training_data.json")

    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(records, handle, indent=4, ensure_ascii=False)

    print(f"Processed data saved to {destination}")


if __name__ == "__main__":
    # Example usage: this runs only when the file is executed as a script.
    input_pdf = "example.pdf"  # Replace with the path to your PDF file
    # output_dir is left at its default, so the JSON is written under "output_data".
    process_pdf_for_training(input_pdf, max_tokens=500)