In [4]:
import os
import re
import numpy as np
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    PythonCodeTextSplitter,
    
)
from typing import List

from sentence_transformers import SentenceTransformer

# --- HuggingFace embedding model setup ---
# Load the all-MiniLM-L6-v2 sentence-transformers model once, when this cell runs.
# NOTE(review): nothing in the visible cells calls embedding_model yet; the
# embedding step inside create_sections_code is commented out.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



# --- Language to code splitter mapping ---
# Keys are lower-case language names; values are splitter classes that
# split_text_by_language instantiates with chunk_size / chunk_overlap.
# Any language missing from this map falls back to RecursiveCharacterTextSplitter.
code_splitter_map = {
    'python': PythonCodeTextSplitter,
    
}

def split_text_by_language(text: str, language: str, chunk_size=1000, chunk_overlap=200) -> List[str]:
    """
    Break ``text`` into chunks with a splitter chosen from ``language``.

    The language name is lower-cased and looked up in ``code_splitter_map``.
    A hit instantiates the mapped splitter class; a miss (natural languages,
    unregistered code languages) uses a ``RecursiveCharacterTextSplitter``
    with paragraph, line, sentence-punctuation, comma, space and character
    separators, tried in that order.

    Args:
        text: The text to split.
        language: Language name; matching is case-insensitive.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    lang_key = language.lower()
    size_kwargs = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}

    # Guard clause: anything without a dedicated splitter takes the generic path.
    if lang_key not in code_splitter_map:
        generic_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            **size_kwargs,
        )
        return generic_splitter.split_text(text)

    dedicated_splitter = code_splitter_map[lang_key](**size_kwargs)
    return dedicated_splitter.split_text(text)


def create_sections_code(
    category_id, blob_name, page_map, mode, language,
    blob_Connection_String, blob_container_name,
    base_threshold, buffer_percent, overlap_sent_count
):
    """
    Turn the pages of one document into a list of cleaned text chunks.

    Each page's text is whitespace-normalised, the non-empty pages are joined
    with a blank line, and the combined text is split by
    ``split_text_by_language``. Every chunk after the first may additionally
    be prefixed with the last ``overlap_sent_count`` sentences of the chunk
    before it.

    Args:
        category_id: Accepted for interface compatibility; not used here.
        blob_name: Document name; only used in the error message.
        page_map: Iterable of 3-tuples ``(page_num, _, text)``. Only ``text``
            influences the result; pages whose text is ``None`` or empty are
            skipped.
        mode: Accepted for interface compatibility; not used here.
        language: Forwarded to ``split_text_by_language`` to pick the splitter.
        blob_Connection_String: Accepted for interface compatibility; not used here.
        blob_container_name: Accepted for interface compatibility; not used here.
        base_threshold: Chunk size handed to the splitter.
        buffer_percent: Splitter overlap, as a percentage of ``base_threshold``.
        overlap_sent_count: Number of trailing sentences of the previous chunk
            to prepend to each chunk; values ``<= 0`` disable this.

    Returns:
        A list of chunk strings. On any exception the error is printed and an
        empty list is returned (best-effort behaviour).
    """
    # A sentence boundary is a run of spaces that follows '.', '!' or '?'.
    # Compiled once here instead of on every loop iteration.
    sentence_boundary = re.compile(r'(?<=[.!?]) +')
    input_data = []

    try:
        # Combine all page texts. Page numbers are deliberately not tracked:
        # the function returns chunk text only, so per-chunk page attribution
        # could never affect the result.
        page_texts = []
        for _page_num, _, text in page_map:
            if not text:
                # A page without text must not abort the whole document.
                continue
            # NOTE(review): this also collapses newlines and indentation, which
            # a code-aware splitter presumably relies on -- confirm this is
            # intended when `language` names a programming language.
            cleaned_text = re.sub(r'\s+', ' ', text).strip()
            if cleaned_text:
                page_texts.append(cleaned_text)
        all_text = "".join(page_text + "\n\n" for page_text in page_texts)

        chunk_overlap = int(base_threshold * buffer_percent / 100)

        # Split text by language
        chunks = split_text_by_language(all_text, language, base_threshold, chunk_overlap)

        for chunk_idx, chunk in enumerate(chunks):
            # Add overlap sentences from the previous (unmodified) chunk if needed.
            if overlap_sent_count > 0 and chunk_idx > 0:
                prev_sentences = sentence_boundary.split(chunks[chunk_idx - 1])
                overlap_text = ' '.join(prev_sentences[-overlap_sent_count:])
                # Skip the prefix when the chunk already begins with it.
                if overlap_text and not chunk.startswith(overlap_text):
                    chunk = overlap_text + " " + chunk

            # Trim the chunk and cap consecutive newlines at one blank line.
            input_data.append(re.sub(r'\n{3,}', '\n\n', chunk.strip()))

    except Exception as e:
        print(f"Error processing file '{blob_name}': {e}")
        return []

    return input_data

# --------- Placeholder for Groq LLM call ---------
# Implement a separate function here to call the Groq API when LLM functionality is needed.
# (No example implementation is provided yet.)




In [5]:
import ast
import json

# --- Inputs for a local test run of create_sections_code ---
# NOTE(review): "FormRechonizer" is kept exactly as written; confirm the spelling is intended.
category_id = "FormRechonizer"
blob_name = "TestCase1"
mode = "search"
language = "en"

# Placeholder storage settings.
blob_Connection_String = "dummy"
blob_container_name = "dummy"

# Chunking parameters: chunk size, splitter overlap as a percentage of that
# size, and how many trailing sentences of the previous chunk to carry over.
base_threshold = 1000
buffer_percent = 10
overlap_sent_count = 2

# Read the raw text and wrap it as a single page entry (page number 1, second field 0).
# If page_map.txt instead holds a literal such as [(1, 0, 'text'), ...],
# build page_map with ast.literal_eval(page_map_content).
with open("page_map.txt", "r") as file:
    page_map_content = file.read()
page_map = [(1, 0, page_map_content)]

# Chunk the document; the result is a list of chunk strings.
result_embedding = create_sections_code(
    category_id=category_id,
    blob_name=blob_name,
    page_map=page_map,
    mode=mode,
    language=language,
    blob_Connection_String=blob_Connection_String,
    blob_container_name=blob_container_name,
    base_threshold=base_threshold,
    buffer_percent=buffer_percent,
    overlap_sent_count=overlap_sent_count,
)

In [6]:
# Dump the value returned by create_sections_code. Despite the variable name,
# it holds the cleaned chunk strings, not embedding vectors.
print(f"************************************* result_embedding *************************************\n{result_embedding}")


************************************* result_embedding *************************************
