In [44]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
from typing import List, Tuple
import tiktoken
import re
import numpy as np

# Load the HuggingFace MiniLM-L6-v2 model
hf_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def split_into_sentences(text):
	sentences = re.split(r'(?<=[.!?])\s+', text.strip())
	return [s.strip() for s in sentences if s.strip()]

def chunk_sentences_by_char_limit(sentences, limit):
	chunks = []
	current_chunk = ""
	for sentence in sentences:
		if len(current_chunk) + len(sentence) + 1 <= limit:
			current_chunk += " " + sentence if current_chunk else sentence
		else:
			chunks.append(current_chunk)
			current_chunk = sentence
	if current_chunk:
		chunks.append(current_chunk)
	return chunks

def get_openai_embedding(text: str) -> List[float]:
    """Get embedding from OpenAI API"""
    try:
        response = openai.Embedding.create(
            input=text,
            engine=EMBEDDING_MODEL_DEPLOYMENT_NAME
        )
        return response['data'][0]['embedding']
    except Exception as e:
        queue_logger.error(f"Error getting OpenAI embedding: {str(e)}")
        return None

def get_optimal_clusters(embeddings: np.ndarray, max_clusters: int = 10) -> int:
    """Determine optimal number of clusters using elbow method"""
    distortions = []
    K = range(1, min(max_clusters + 1, len(embeddings)))
    for k in K:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(embeddings)
        distortions.append(kmeans.inertia_)
    
    # Find the elbow point
    if len(distortions) > 1:
        # Calculate the rate of change of distortions
        deltas = np.diff(distortions)
        # Find the point of maximum change
        optimal_k = np.argmax(deltas) + 1
        return optimal_k
    return 1

def create_sections_embedding(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		try:
			# Prepare text and page information
			sentences = []
			sentence_to_page = []
			
			for page_num, _, text in page_map:
				# Split text into sentences
				page_sentences = split_into_sentences(text)
				sentences.extend(page_sentences)
				sentence_to_page.extend([page_num] * len(page_sentences))
			
			if not sentences:
				return []

			# Generate embeddings for all sentences using OpenAI
			embeddings = []
			for sentence in sentences:
				try:
					embedding = hf_model.encode(sentence, convert_to_numpy=True)
					embeddings.append(embedding)
				except Exception as e:
					print(f"Embedding failed for sentence: {sentence}, error: {e}")
					# Fallback to zero vector with 384 dimensions (MiniLM-L6-v2)
					embeddings.append(np.zeros(384))
					
			embeddings = np.array(embeddings)
			
			# Determine optimal number of clusters
			n_clusters = get_optimal_clusters(embeddings)
			
			# Perform KMeans clustering
			kmeans = KMeans(n_clusters=n_clusters, random_state=42)
			clusters = kmeans.fit_predict(embeddings)
			
			# Group sentences by cluster
			cluster_sentences = {}
			cluster_pages = {}
			for idx, cluster_id in enumerate(clusters):
				if cluster_id not in cluster_sentences:
					cluster_sentences[cluster_id] = []
					cluster_pages[cluster_id] = set()
				cluster_sentences[cluster_id].append(sentences[idx])
				cluster_pages[cluster_id].add(sentence_to_page[idx])
			
			# Create chunks from clusters
			for cluster_id, cluster_texts in cluster_sentences.items():
				# Get page range for this cluster
				pages = sorted(cluster_pages[cluster_id])
				start_page = pages[0]
				end_page = pages[-1]
				page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)
				
				# Combine sentences in cluster
				chunk_text = " ".join(cluster_texts)
				
				# If chunk is too large, split it further
				if len(chunk_text) > base_threshold:
					# Split into smaller chunks while maintaining semantic coherence
					sub_chunks = []
					current_chunk = []
					current_length = 0
					
					for sentence in cluster_texts:
						sentence_length = len(sentence)
						if current_length + sentence_length <= base_threshold:
							current_chunk.append(sentence)
							current_length += sentence_length
						else:
							if current_chunk:
								sub_chunks.append(" ".join(current_chunk))
							current_chunk = [sentence]
							current_length = sentence_length
					
					if current_chunk:
						sub_chunks.append(" ".join(current_chunk))
					
					# Add sub-chunks to input data
					for idx, sub_chunk in enumerate(sub_chunks):
						input_data.append({
							'id': f"{chunk_id_prefix}_{page_range}_{cluster_id}_{idx}",
							'title': blob_name,
							'category': category_id,
							'sourcepage': f"{blob_name}::{page_range}",
							'content': sub_chunk.strip()
						})
				else:
					# Add single chunk to input data
					input_data.append({
						'id': f"{chunk_id_prefix}_{page_range}_{cluster_id}",
						'title': blob_name,
						'category': category_id,
						'sourcepage': f"{blob_name}::{page_range}",
						'content': chunk_text.strip()
					})

		except Exception as e:
			queue_logger.error(f"Error processing file '{blob_name}' with OpenAI embedding-based clustering: {str(e)}")
			return []

	return input_data

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [45]:
import os
import re
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import OpenAIEmbeddings
from typing import List, Dict, Tuple
from langchain.embeddings import HuggingFaceEmbeddings

def split_into_sentences(text):
	sentences = re.split(r'(?<=[.!?])\s+', text.strip())
	return [s.strip() for s in sentences if s.strip()]

def chunk_sentences_by_char_limit(sentences, limit):
	chunks = []
	current_chunk = ""
	for sentence in sentences:
		if len(current_chunk) + len(sentence) + 1 <= limit:
			current_chunk += " " + sentence if current_chunk else sentence
		else:
			chunks.append(current_chunk)
			current_chunk = sentence
	if current_chunk:
		chunks.append(current_chunk)
	return chunks

def create_sections_chunker(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
):
	chunk_id_prefix = blob_name.replace(" ", "_").replace(".", "_")
	input_data = []

	if blob_name.lower().endswith((".jpg", ".png", ".jpeg")):
		try:
			image_descriptions = get_image_description(blob_name, mode, blob_Connection_String, blob_container_name)
			for idx, content in enumerate(image_descriptions):
				input_data.append({
					'id': f"{chunk_id_prefix}_{idx}",
					'title': blob_name,
					'category': category_id,
					'sourcepage': blob_name_from_file_page(blob_name),
					'content': content
				})
		except Exception as e:
			queue_logger.error(f"Image error for '{blob_name}': {e}")
	else:
		try:
			# Get OpenAI API key from environment or Azure Key Vault
			try:
				openai_api_key = os.getenv("OPENAI_API_KEY")
				if not openai_api_key:
					# Try to get from Azure Key Vault
					openai_api_key = KeyVaultManager.fetch_tag_value_from_key_and_tag(
						DOCUAGENT_KEYVAULT_NAME, 
						'AZURE_OPENAI_API_KEY'
					)
			except Exception as e:
				queue_logger.error(f"Failed to get OpenAI API key: {str(e)}")
				return []

			# Initialize OpenAI embeddings
			embeddings = HuggingFaceEmbeddings(
    		model_name="sentence-transformers/all-MiniLM-L6-v2"
			)

			# Initialize SemanticChunker
			text_splitter = SemanticChunker(
				embeddings=embeddings,
				min_chunk_size=base_threshold,
				#chunk_overlap=int(base_threshold * buffer_percent / 100)
			)

			# Combine all text while maintaining page tracking
			all_text = ""
			page_positions = []  # List of (start_pos, end_pos, page_num)
			current_pos = 0

			for page_num, _, text in page_map:
				cleaned_text = re.sub(r'\s+', ' ', text).strip()
				if cleaned_text:
					start_pos = current_pos
					all_text += cleaned_text + " "
					current_pos = len(all_text)
					page_positions.append((start_pos, current_pos, page_num))

			# Split text using semantic chunker
			chunks = text_splitter.split_text(all_text)

			# Process each chunk and determine its page range
			for chunk_idx, chunk in enumerate(chunks):
				chunk_start = all_text.find(chunk)
				chunk_end = chunk_start + len(chunk)

				# Find pages that overlap with this chunk
				chunk_pages = set()
				for start_pos, end_pos, page_num in page_positions:
					if (chunk_start <= end_pos and chunk_end >= start_pos):
						chunk_pages.add(page_num)

				if chunk_pages:
					start_page = min(chunk_pages)
					end_page = max(chunk_pages)
					page_range = f"{start_page}-{end_page}" if start_page != end_page else str(start_page)

					# Add overlap from previous chunk if needed
					if overlap_sent_count > 0 and chunk_idx > 0:
						prev_chunk = chunks[chunk_idx - 1]
						prev_sentences = split_into_sentences(prev_chunk)
						overlap_text = ' '.join(prev_sentences[-overlap_sent_count:])
						if overlap_text and not chunk.startswith(overlap_text):
							chunk = overlap_text + " " + chunk

					input_data.append({
						'id': f"{chunk_id_prefix}_{page_range}_{chunk_idx}",
						'title': blob_name,
						'category': category_id,
						'sourcepage': f"{blob_name}::{page_range}",
						'content': chunk.strip()
					})

		except Exception as e:
			queue_logger.error(f"Error processing file '{blob_name}' with SemanticChunker: {str(e)}")
			return []

	return input_data

In [46]:
category_id = "FormRechonizer" 
blob_name = "TestCase1"
mode = "search" 
language = "en"
blob_Connection_String = "dummy"
blob_container_name = "dummy"
base_threshold = 1000
buffer_percent = 10
overlap_sent_count = 2


import ast
import json

with open("page_map_FormRechonizer.txt", "r") as file:
    page_map_content = file.read()

# If your file is like [(1, 0, 'text'), ...]
page_map = ast.literal_eval(page_map_content)

# Then pass it to your function
result_embedding = create_sections_embedding(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
)

result_chunker= create_sections_chunker(
	category_id, blob_name, page_map, mode, language,
	blob_Connection_String, blob_container_name,
	base_threshold, buffer_percent, overlap_sent_count
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00,  2.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.21it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.76it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.50it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.42it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 61.17it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.61it/s]
Batches: 1

In [47]:
print(f"************************************* result_chunker *************************************\n{result_chunker}")


************************************* result_chunker *************************************
[{'id': 'TestCase1_1-3_0', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-3', 'content': 'Tall Tales A short story collection by Grade 10 Contents <table><tr><td>Judgement, by Aletta van der Merwe.</td><td>3</td></tr><tr><td>The Three Little Men, by Andrew Kim</td><td>6</td></tr><tr><td>The Examination Day, by Julia Bauerschmidt</td><td>10</td></tr><tr><td>The Businessman, by Dennis Yang.</td><td>11</td></tr><tr><td>The Red Jinn from the Golden Lamp, by Michelle Tham</td><td>14</td></tr><tr><td>The Hunt, by Jalen Cleary.</td><td>18</td></tr><tr><td>Examination Day, by Jessica Yoon</td><td>22</td></tr><tr><td>Mr.Tompkins, by Lauren Zammit.</td><td>27</td></tr><tr><td>The Scope, by Lucas Baumgaertel</td><td>29</td></tr><tr><td>The Training Exercise, by Isaac Eastland</td><td>31</td></tr><tr><td>Berlock Nolmes, by Tomas Branco.</td><td>35</td></tr><tr><td>The Selecti

In [48]:
print(f"************************************* result_embedding *************************************\n{result_embedding}")


************************************* result_embedding *************************************
[{'id': 'TestCase1_1-8_7', 'title': 'TestCase1', 'category': 'FormRechonizer', 'sourcepage': 'TestCase1::1-8', 'content': 'Tall Tales\nA short story collection\nby Grade 10 Yuki, No given name/s". What is my last name?"\nBut even as the words left her lips Alexis already knew the answer. Jack?" I examined the extraordinary way the way Jack was limping to get more packages to deliver. "Delivery for Mr Richard Bronwen!"\nI heard his weak trembling voice as he was trying to make an attempt to find out who I was. Richard ... It was not hard to notice that the short man in the pixelized monitor of the surveillance camera was Jack. Why did you leave me"\n"Jack? "What is this Jack? "\n"Jack, talk to me, where am I? "\n"Jack what are you talking about? As my vision blurred, I saw Jack, looking at me with empathy filled in his eyes. Mr Bronwen?"\nIt was a familiar voice, after stopping my footsteps I th