In [92]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import HTMLHeaderTextSplitter


In [93]:
def split_into_sentences(text):
	"""Break `text` into sentences.

	A boundary is any run of whitespace that directly follows '.', '!' or '?'.
	The terminating punctuation stays attached to its sentence; surrounding
	whitespace is trimmed and empty fragments are discarded.
	"""
	fragments = re.split(r'(?<=[.!?])\s+', text.strip())
	trimmed = (fragment.strip() for fragment in fragments)
	return [sentence for sentence in trimmed if sentence]

def chunk_sentences_by_char_limit(sentences, limit):
	"""Greedily pack sentences into space-joined chunks of at most `limit` characters.

	Sentences are never split: a single sentence longer than `limit` becomes
	a chunk of its own (and therefore exceeds the limit).

	Args:
		sentences: Iterable of sentence strings, in reading order.
		limit: Maximum chunk length in characters, counting the single
			space inserted between joined sentences.

	Returns:
		List of non-empty chunk strings, in order.
	"""
	chunks = []
	current_chunk = ""
	for sentence in sentences:
		if not current_chunk:
			# Nothing is buffered, so there is nothing to flush and no
			# separator to account for. Starting the chunk directly avoids
			# emitting an empty chunk when the sentence alone is >= limit.
			current_chunk = sentence
		elif len(current_chunk) + 1 + len(sentence) <= limit:
			current_chunk = f"{current_chunk} {sentence}"
		else:
			chunks.append(current_chunk)
			current_chunk = sentence
	if current_chunk:
		chunks.append(current_chunk)
	return chunks

In [94]:
def _most_specific_header(metadata, header_labels):
	"""Return the text of the deepest heading present in `metadata`, or ''."""
	for label in reversed(header_labels):
		value = metadata.get(label)
		if value:
			return value
	return ''


def create_sections_html(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	"""
	Chunking method using HTMLHeaderTextSplitter with additional splitting for large content.

	Image blobs (.jpg/.png/.jpeg) are described via get_image_description and
	returned immediately. For everything else each page is wrapped in a <div>,
	split on h1-h6 headings, and any resulting chunk longer than
	`base_threshold` characters is split again with
	RecursiveCharacterTextSplitter. If a page fails to process, that page
	falls back to plain sentence-based chunking.

	Args:
		category_id: Stored under 'category' on every entry.
		blob_name: Source name; used for 'title', 'sourcepage' and, with
			spaces and dots replaced by underscores, as the id prefix.
		page_map: Iterable of 3-item entries; the first item is the page
			number, the third is the page text, the second is ignored.
		mode, blob_Connection_String, blob_container_name: Forwarded to
			get_image_description for image blobs only.
		language, buffer_percent: Accepted but not used by this function.
		base_threshold: Maximum chunk length in characters.
		overlap_sent_count: Passed as `chunk_overlap` to the recursive splitter.

	Returns:
		List of dicts with keys 'id', 'title', 'category', 'sourcepage',
		'content' and, for header-split chunks, 'metadata'.

	Raises:
		ValueError: If a page_map entry does not have exactly three items.
	"""
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	# Handle image files
	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			# Best effort: an image that cannot be described is logged, not fatal.
			queue_logger.error(f"Image error for '{blob_name}': {e}")
		return input_data

	# Normalize page_map to (page_num, text) format
	normalized_map = []
	for item in page_map:
		if len(item) != 3:
			raise ValueError(f"Unexpected page_map format: {item}")
		page_num, _, text = item
		normalized_map.append((page_num, text))

	# Initialize HTMLHeaderTextSplitter
	headers_to_split_on = [
		("h1", "Header 1"),
		("h2", "Header 2"),
		("h3", "Header 3"),
		("h4", "Header 4"),
		("h5", "Header 5"),
		("h6", "Header 6"),
	]
	# The splitter stores heading text in chunk metadata under these labels.
	header_labels = [label for _, label in headers_to_split_on]

	html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

	# Initialize RecursiveCharacterTextSplitter for further splitting large chunks.
	# NOTE(review): chunk_overlap is measured with length_function (characters
	# here), so overlap_sent_count acts as a character count, not a sentence
	# count - confirm this is intended.
	recursive_splitter = RecursiveCharacterTextSplitter(
		chunk_size=base_threshold,
		chunk_overlap=overlap_sent_count,
		length_function=len,
		separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
	)

	# Process each page
	for page_num, page_text in normalized_map:
		# Wrap the page text in a container element for the HTML splitter
		html_text = f"<div>{page_text}</div>"

		# Entries are staged per page and committed only once the page is done,
		# so a failure midway cannot leave partial entries that the fallback
		# would then duplicate.
		page_entries = []
		try:
			# First split by HTML headers
			for header_chunk in html_splitter.split_text(html_text):
				content = header_chunk.page_content
				# Metadata keys are "Header 1".."Header 6" (never plain
				# "Header"), so pick the deepest heading that applies.
				header = _most_specific_header(header_chunk.metadata, header_labels)

				if len(content) > base_threshold:
					# Too large: split further and tag each piece with its position
					sub_chunks = recursive_splitter.split_text(content)
					for idx, sub_chunk in enumerate(sub_chunks):
						page_entries.append({
							'id': f"{chunk_id_prefix}_{page_num}_{len(input_data) + len(page_entries)}",
							'title': blob_name,
							'category': category_id,
							'sourcepage': f"{blob_name}::{page_num}",
							'content': sub_chunk,
							'metadata': {
								'header': header,
								'page': page_num,
								'sub_chunk': idx + 1
							}
						})
				else:
					# Within the threshold: keep the chunk as is
					page_entries.append({
						'id': f"{chunk_id_prefix}_{page_num}_{len(input_data) + len(page_entries)}",
						'title': blob_name,
						'category': category_id,
						'sourcepage': f"{blob_name}::{page_num}",
						'content': content,
						'metadata': {
							'header': header,
							'page': page_num
						}
					})

		except Exception as e:
			queue_logger.error(f"Error processing page {page_num}: {e}")
			# Fallback to basic text splitting if HTML splitting fails;
			# anything staged for this page before the failure is discarded.
			sentences = split_into_sentences(page_text)
			chunks = chunk_sentences_by_char_limit(sentences, base_threshold)
			page_entries = [
				{
					'id': f"{chunk_id_prefix}_{page_num}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': f"{blob_name}::{page_num}",
					'content': chunk
				}
				for idx, chunk in enumerate(chunks)
			]

		input_data.extend(page_entries)

	return input_data

In [95]:
import ast
import json

# --- Inputs for a local trial run of create_sections_html ---
# blob_name has no image extension, so the image branch (the only consumer of
# mode / connection string / container name) is never reached; the "dummy"
# values are placeholders.
category_id = "FormRechonizer"
blob_name = "TestCase1"
mode = "search"
language = "en"
blob_Connection_String = "dummy"
blob_container_name = "dummy"
base_threshold = 10000
buffer_percent = 10
overlap_sent_count = 2

# NOTE(review): no explicit encoding is given, so the platform default is
# used when decoding this file - confirm that matches how it was written.
with open("page_map_FormRechonizer.txt", "r") as file:
    page_map_content = file.read()

# The whole file is treated as the text of a single page (page 1, offset 0).
# If the file instead holds a literal like [(1, 0, 'text'), ...], parse it with:
#page_map = ast.literal_eval(page_map_content)
page_map = [(1, 0, page_map_content)]

# Run the HTML-header chunker on the single-page map.
result_html = create_sections_html(
	category_id=category_id,
	blob_name=blob_name,
	page_map=page_map,
	mode=mode,
	language=language,
	blob_Connection_String=blob_Connection_String,
	blob_container_name=blob_container_name,
	base_threshold=base_threshold,
	buffer_percent=buffer_percent,
	overlap_sent_count=overlap_sent_count,
)


In [96]:
# Dump the generated sections under a banner line for visual inspection.
print("************************************* result_html *************************************", result_html, sep="\n")


************************************* result_html *************************************
[{'id': 'TestCase1_1_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1', 'content': '[(1, 0, \'Tall Tales\\nA short story collection\\nby Grade 10 \'), (2, 48, \'Contents\\n \'), (3, 1162, \'Judgement\\nby Aletta van der Merwe\\nAs the last rays of sunlight disappeared behind the old abandoned skyscraper, bullets racked its walls and windows, making dull echoing noises inside. Meanwhile, Alexis was busy writing, with her papers scattered all around and Yuki, her white pelt stained with blood and ink, on her lap gently purring. She couldn\\\'t have been in a better mood, yet her green eyes and young face still looked troubled, and this time it wasn\\\'t because of the noise from outside. Throwing down her pen, she pushed her hands through her short dark brown hair and gave a sigh. It had been hard these past days. Having so many questions with no one to answer her, t