# Chunking

### Imports

In [9]:
import argparse
import json
import re
from pathlib import Path

import markdown      
from bs4 import BeautifulSoup  

### Defining functions

In [10]:
def split_by_header(md_text: str, header_level: int = 1):
    """
    Split markdown text by a given header level (#, ##, ###, etc.)
    Returns a list of tuples (title, section_text)
    """
    header_pattern = {
        1: r'^\#\s+',
        2: r'^\#\#\s+',
        3: r'^\#\#\#\s+',
    }.get(header_level, r'^\#\s+')

    headers = []
    for m in re.finditer(rf'{header_pattern}(.+)', md_text, flags=re.MULTILINE):
        headers.append((m.start(), m.group(1).strip()))

    chunks = []
    if not headers:
        chunks.append(("", md_text.strip()))
        return chunks

    for i, (start, title) in enumerate(headers):
        end = headers[i + 1][0] if i + 1 < len(headers) else len(md_text)
        section = md_text[start:end].strip()
        chunks.append((title, section))

    return chunks


def has_html_table(html_text: str) -> bool:
    """
    Detect if the given HTML text contains at least one <table> element.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    return bool(soup.find("table"))


def extract_context_sentences(text: str, num_sentences: int = 3) -> str:
    """
    Extract the last N sentences from text to use as context.
    
    Args:
        text: Text to extract sentences from
        num_sentences: Number of sentences to extract (default: 3)
    
    Returns:
        String containing the last N sentences
    """
    # Simple sentence splitting (handles ., !, ?)
    sentence_pattern = r'[.!?]+\s+'
    sentences = re.split(sentence_pattern, text.strip())
    
    # Filter out empty sentences
    sentences = [s.strip() for s in sentences if s.strip()]
    
    # Get last N sentences
    context_sentences = sentences[-num_sentences:] if len(sentences) >= num_sentences else sentences
    
    # Rejoin with proper punctuation
    if context_sentences:
        return '. '.join(context_sentences) + '.'
    return ""


def md_chunk_to_fields(title: str, chunk_md: str, context_text: str = ""):
    """
    Convert one markdown section to structured fields.
    
    Args:
        title: Section title
        chunk_md: Markdown content
        context_text: Optional context text to prepend (for table chunks)
    
    Returns:
        Dictionary with structured fields:
        - raw markdown
        - HTML
        - plain text
        - tables (as HTML and text)
        - concatenated_text (for embedding)
        - is_table: flag indicating if chunk contains a table
        - has_context: flag indicating if context was prepended
    """
    # Detect if this chunk has a table
    is_table = has_table(chunk_md)
    
    # Convert markdown to HTML
    html = markdown.markdown(chunk_md, extensions=['tables', 'fenced_code'])
    soup = BeautifulSoup(html, 'html.parser')

    # Extract tables
    tables_html = []
    tables_text_parts = []
    for tbl in soup.find_all('table'):
        tables_html.append(str(tbl))
        rows = []
        for tr in tbl.find_all('tr'):
            cells = [c.get_text(strip=True) for c in tr.find_all(['th', 'td'])]
            rows.append(" | ".join(cells))
        tables_text_parts.append("\n".join(rows))

    # Remove tables from main HTML for clean text
    for t in soup.find_all('table'):
        t.decompose()
    plain_text = soup.get_text(separator="\n").strip()

    # Build concatenated text for embedding
    # If this is a table chunk and we have context, prepend it
    if is_table and context_text:
        # Prepend context to help embeddings understand table meaning
        concatenated_text = f"Context: {context_text}\n\n{title}\n\n{plain_text}\n\n" + "\n\n".join(tables_text_parts)
        has_context = True
    else:
        concatenated_text = f"{title}\n\n{plain_text}\n\n" + "\n\n".join(tables_text_parts)
        has_context = False

    return {
        "title": title,
        "raw_markdown": chunk_md,
        "html": html,
        "text": plain_text,
        "tables_html": tables_html,
        "tables_text": tables_text_parts,
        "concatenated_text": concatenated_text.strip(),
        "is_table": is_table,
        "has_context": has_context,
        "context": context_text if has_context else ""
    }


def process_chunks_with_context(sections: list, num_context_sentences: int = 3) -> list:
    """
    Process all chunks and add context to table chunks from previous chunks.
    
    Args:
        sections: List of (title, content) tuples
        num_context_sentences: Number of sentences to extract as context
    
    Returns:
        List of processed chunk dictionaries
    """
    data = []
    
    for i, (title, content) in enumerate(sections):
        # Check if this chunk contains a table
        if has_table(content):
            # Extract context from previous chunks (up to 3 chunks back)
            context_parts = []
            for j in range(max(0, i - 3), i):
                prev_title, prev_content = sections[j]
                # Extract text content (without the markdown table markup)
                prev_html = markdown.markdown(prev_content, extensions=['tables', 'fenced_code'])
                prev_soup = BeautifulSoup(prev_html, 'html.parser')
                # Remove tables from previous chunk
                for tbl in prev_soup.find_all('table'):
                    tbl.decompose()
                prev_text = prev_soup.get_text(separator=" ").strip()
                
                if prev_text:
                    # Extract last few sentences from this previous chunk
                    sentences = extract_context_sentences(prev_text, num_context_sentences)
                    if sentences:
                        context_parts.append(sentences)
            
            # Use the most recent context (last chunk with text)
            context = context_parts[-1] if context_parts else ""
        else:
            context = ""
        
        # Process the chunk with context
        record = md_chunk_to_fields(title, content, context)
        record["id"] = i + 1
        data.append(record)
    
    return data

### Loading markdown file

In [11]:
input_file = Path("/home/codepips/Home/Portfolio/Projects/ŸÖÿ≥ÿßÿ±/data/processed/MD/TBS_Handbook-2022.md")  
md_text = input_file.read_text(encoding="utf-8")
print(md_text[:500])  # preview first 500 chars


Educating to Lead

Ministry of Higher Education and Scientific Research University of Tunis

Tunis Business School

‚ÄúEducating Future Leaders and Managers for a Global Economy‚Äù

SCHOOL HANDBOOK

Version: September, 2022

Last update: February 5, 2023

# DISCLAIMER

This Handbook provides information about the school, its programs, guidelines, and regulations. It has been approved by the Scientific Council. It is the only body in the school that can formally modify this handbook.

Tunis Business 


### Splitting markdown by \#

In [12]:
sections = split_by_header(md_text, header_level=1)
print(f"Found {len(sections)} sections")

# Preview first section title
sections[0][0]


Found 88 sections


'DISCLAIMER'

### Converting chunks into JSON

In [13]:
# Process chunks with context for tables
data = process_chunks_with_context(sections, num_context_sentences=3)

print(f"Processed {len(data)} chunks")

# Show statistics
table_chunks = [d for d in data if d['is_table']]
chunks_with_context = [d for d in data if d['has_context']]

print(f"\nüìä Statistics:")
print(f"   Total chunks: {len(data)}")
print(f"   Table chunks: {len(table_chunks)}")
print(f"   Chunks with context: {len(chunks_with_context)}")

# Show example of a table chunk with context
if chunks_with_context:
    print(f"\nüìã Example table chunk with context:")
    example = chunks_with_context[0]
    print(f"   ID: {example['id']}")
    print(f"   Title: {example['title']}")
    print(f"   Context: {example['context'][:150]}...")
    print(f"   Has table: {example['is_table']}")

Processed 88 chunks

üìä Statistics:
   Total chunks: 88
   Table chunks: 0
   Chunks with context: 0


In [14]:
import json
print(json.dumps(data[0], indent=2)[:800])


{
  "title": "DISCLAIMER",
  "raw_markdown": "# DISCLAIMER\n\nThis Handbook provides information about the school, its programs, guidelines, and regulations. It has been approved by the Scientific Council. It is the only body in the school that can formally modify this handbook.\n\nTunis Business School reserves the right to amend any policy at any time. The most updated version is the online version (updated on 5 February 2023). It is the responsibility of the students to be familiar with the content of this handbook.",
  "html": "<h1>DISCLAIMER</h1>\n<p>This Handbook provides information about the school, its programs, guidelines, and regulations. It has been approved by the Scientific Council. It is the only body in the school that can formally modify this handbook.</p>\n<p>Tunis Busine


In [15]:
output_file = Path("chunks.jsonl")
with output_file.open("w", encoding="utf-8") as f:
    for record in data:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"‚úÖ Saved {len(data)} chunks to {output_file}")


‚úÖ Saved 88 chunks to chunks.jsonl
