# Get Data Ready for Feeding

In [1]:
def generate_unique_id(title, page_number, chunk_number, format):
    """Build an identifier shaped like ``<format>-<slug>-p<page>-c<chunk>``.

    Args:
        title: Text whose alphanumeric characters, lowercased one at a time,
            form the slug; every other character is dropped.
        page_number: Rendered with ``str()`` and zero-filled to at least
            three characters.
        chunk_number: Rendered the same way as ``page_number``.
        format: Prefix placed at the start of the identifier.

    Returns:
        The assembled identifier string.
    """
    # Filter first, then lowercase each surviving character individually.
    kept_chars = [ch.lower() for ch in title if ch.isalnum()]
    page_part = str(page_number).zfill(3)
    chunk_part = str(chunk_number).zfill(3)
    return "{}-{}-p{}-c{}".format(
        format, "".join(kept_chars), page_part, chunk_part
    )

def transform_headings(headings: list[tuple[str, int]]) -> tuple[str, list[int]]:
    """Split heading pairs into one joined string and a list of levels.

    Args:
        headings: Iterable of pairs; item 0 is the heading text and item 1
            is its level.

    Returns:
        A tuple ``(text, levels)``. ``text`` is every heading text followed
        by ``"; "`` (so a non-empty result ends with ``"; "``), and
        ``levels`` holds item 1 of each pair in input order.
    """
    labelled = []
    levels = []
    for entry in headings:
        # Each text carries its own separator, including the last one.
        labelled.append(entry[0] + "; ")
        levels.append(entry[1])
    return "".join(labelled), levels

def create_vespa_doc(doc):
    """Wrap ``doc`` in a put-style envelope for the Vespa feed.

    Args:
        doc: Mapping of field names to values; must contain ``"doc_id"``.

    Returns:
        A dict with two keys: ``"put"``, the string
        ``id:waku-search:doc::<doc_id>``, and ``"fields"``, a shallow copy
        of ``doc`` (the ``doc_id`` entry included).
    """
    envelope = {}
    envelope["put"] = "id:waku-search:doc::{}".format(doc["doc_id"])
    # Copy so the envelope does not alias the caller's dict.
    envelope["fields"] = dict(doc)
    return envelope

def preprocessing(docs, format, chunker=None):
    """Turn raw documents into a list of Vespa feed entries.

    Each input document is shallow-copied once per output entry and gains
    ``text_format``, ``chunk_number``, ``doc_id`` and ``heading_levels``;
    its ``headings`` value is replaced by the joined string returned from
    ``transform_headings``. The copy is then wrapped by ``create_vespa_doc``.

    Args:
        docs: Iterable of dicts. Each one is read for ``"title"``,
            ``"page_number"`` and ``"headings"``, plus ``"text"`` when a
            chunker is supplied.
        format: Label stored as ``text_format`` and used as the id prefix.
        chunker: Optional object exposing ``split_text(text)``. When truthy,
            one entry is produced per returned chunk, numbered from 1, with
            ``"text"`` replaced by the chunk. When falsy, the document
            yields a single entry with ``chunk_number`` 1 and its text left
            untouched.

    Returns:
        A list of the dicts produced by ``create_vespa_doc``, in input order.
    """
    # Marker for "keep the document's own text" on the unchunked path.
    keep_text = object()

    feed_entries = []
    for source in docs:
        if chunker:
            pieces = chunker.split_text(source["text"])
        else:
            pieces = (keep_text,)

        for number, piece in enumerate(pieces, start=1):
            record = dict(source)
            record["text_format"] = format
            record["chunk_number"] = number
            if piece is not keep_text:
                record["text"] = piece

            record["doc_id"] = generate_unique_id(
                record["title"],
                record["page_number"],
                number,
                format,
            )

            joined_headings, levels = transform_headings(record["headings"])
            record["headings"] = joined_headings
            record["heading_levels"] = levels

            feed_entries.append(create_vespa_doc(record))

    return feed_entries

In [2]:
import json

# Load the two document collections. JSON text is UTF-8 by specification, so
# the encoding is pinned instead of being left to the platform default.
with open("ocr.json", "r", encoding="utf-8") as f:
    ocr = json.load(f)

with open("markdown.json", "r", encoding="utf-8") as f:
    markdown = json.load(f)

In [3]:
import json
import os

from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter

formats = ["ocr", "markdown", "chunked_ocr", "chunked_markdown"]

# Define chunker configurations once
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
OUTPUT_DIR = "vespa-feed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The loop variable is not called `format`: at notebook scope that name would
# shadow the builtin for every later cell.
for text_format in formats:
    # Only the "chunked_*" variants get a splitter; the others are passed
    # through whole (chunker stays None).
    chunker = None
    if "chunked" in text_format:
        if "markdown" in text_format:
            splitter_cls = MarkdownTextSplitter
        else:
            splitter_cls = RecursiveCharacterTextSplitter
        chunker = splitter_cls(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

    # Select document source
    docs = ocr if "ocr" in text_format else markdown

    # Process documents
    processed_docs = preprocessing(docs, text_format, chunker)

    # Save results
    output_path = f"{OUTPUT_DIR}/{text_format}_docs.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(processed_docs, f)